xref: /xnu-10063.121.3/bsd/net/dlil.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/if_arp.h>
54 #include <net/iptap.h>
55 #include <net/pktap.h>
56 #include <net/nwk_wq.h>
57 #include <sys/kern_event.h>
58 #include <sys/kdebug.h>
59 #include <sys/mcache.h>
60 #include <sys/syslog.h>
61 #include <sys/protosw.h>
62 #include <sys/priv.h>
63 
64 #include <kern/assert.h>
65 #include <kern/task.h>
66 #include <kern/thread.h>
67 #include <kern/sched_prim.h>
68 #include <kern/locks.h>
69 #include <kern/zalloc.h>
70 
71 #include <net/kpi_protocol.h>
72 #include <net/if_types.h>
73 #include <net/if_ipsec.h>
74 #include <net/if_llreach.h>
75 #include <net/if_utun.h>
76 #include <net/kpi_interfacefilter.h>
77 #include <net/classq/classq.h>
78 #include <net/classq/classq_sfb.h>
79 #include <net/flowhash.h>
80 #include <net/ntstat.h>
81 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
82 #include <skywalk/lib/net_filter_event.h>
83 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103 
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114 
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117 
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120 
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127 
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133 
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137 
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143 
144 #include <net/sockaddr_utils.h>
145 
146 #include <os/log.h>
147 
148 #define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
149 #define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
150 #define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
151 #define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
152 #define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
153 
154 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
155 #define MAX_LINKADDR        4 /* LONGWORDS */
156 
157 #if 1
158 #define DLIL_PRINTF     printf
159 #else
160 #define DLIL_PRINTF     kprintf
161 #endif
162 
163 #define IF_DATA_REQUIRE_ALIGNED_64(f)   \
164 	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
165 
166 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
167 	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
168 
/*
 * Version tags for the protocol KPI callback table stored in
 * struct if_proto (see the `kpi' union below); selects which
 * arm of that union is valid.
 */
enum {
	kProtoKPI_v1    = 1,
	kProtoKPI_v2    = 2
};
173 
174 uint64_t if_creation_generation_count = 0;
175 
176 /*
177  * List of if_proto structures in if_proto_hash[] is protected by
178  * the ifnet lock.  The rest of the fields are initialized at protocol
179  * attach time and never change, thus no lock required as long as
180  * a reference to it is valid, via if_proto_ref().
181  */
182 struct if_proto {
183 	SLIST_ENTRY(if_proto)       next_hash;
184 	u_int32_t                   refcount;
185 	u_int32_t                   detached;
186 	struct ifnet                *ifp;
187 	protocol_family_t           protocol_family;
188 	int                         proto_kpi;
189 	union {
190 		struct {
191 			proto_media_input               input;
192 			proto_media_preout              pre_output;
193 			proto_media_event               event;
194 			proto_media_ioctl               ioctl;
195 			proto_media_detached            detached;
196 			proto_media_resolve_multi       resolve_multi;
197 			proto_media_send_arp            send_arp;
198 		} v1;
199 		struct {
200 			proto_media_input_v2            input;
201 			proto_media_preout              pre_output;
202 			proto_media_event               event;
203 			proto_media_ioctl               ioctl;
204 			proto_media_detached            detached;
205 			proto_media_resolve_multi       resolve_multi;
206 			proto_media_send_arp            send_arp;
207 		} v2;
208 	} kpi;
209 };
210 
211 SLIST_HEAD(proto_hash_entry, if_proto);
212 
213 #define DLIL_SDLDATALEN \
214 	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
215 
216 /*
217  * In the common case, the LL address is stored in the
218  * `dl_if_lladdr' member of the `dlil_ifnet'. This is sufficient
219  * for LL addresses that do not exceed the `DLIL_SDLMAXLEN' constant.
220  */
221 struct dl_if_lladdr_std {
222 	struct ifaddr   ifa;
223 	u_int8_t        addr_sdl_bytes[DLIL_SDLMAXLEN];
224 	u_int8_t        mask_sdl_bytes[DLIL_SDLMAXLEN];
225 };
226 
227 /*
228  * However, in some rare cases we encounter LL addresses which
229  * would not fit in the `DLIL_SDLMAXLEN' limitation. In such cases
230  * we allocate the storage in the permanent arena, using this memory layout.
231  */
232 struct dl_if_lladdr_xtra_space {
233 	struct ifaddr   ifa;
234 	u_int8_t        addr_sdl_bytes[SOCK_MAXADDRLEN];
235 	u_int8_t        mask_sdl_bytes[SOCK_MAXADDRLEN];
236 };
237 
238 struct dlil_ifnet {
239 	struct ifnet    dl_if;                  /* public ifnet */
240 	/*
241 	 * DLIL private fields, protected by dl_if_lock
242 	 */
243 	decl_lck_mtx_data(, dl_if_lock);
244 	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
245 	u_int32_t dl_if_flags;                  /* flags (below) */
246 	u_int32_t dl_if_refcnt;                 /* refcnt */
247 	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
248 	void    *dl_if_uniqueid;                /* unique interface id */
249 	size_t  dl_if_uniqueid_len;             /* length of the unique id */
250 	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
251 	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
252 	struct dl_if_lladdr_std dl_if_lladdr;   /* link-level address storage*/
253 	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
254 	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
255 	u_int8_t dl_if_permanent_ether_is_set;
256 	u_int8_t dl_if_unused;
257 	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
258 	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
259 	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
260 };
261 
262 /* Values for dl_if_flags (private to DLIL) */
263 #define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
264 #define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
265 #define DLIF_DEBUG      0x4     /* has debugging info */
266 
267 #define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */
268 
269 /* For gdb */
270 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
271 
272 struct dlil_ifnet_dbg {
273 	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
274 	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
275 	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
276 	/*
277 	 * Circular lists of ifnet_{reference,release} callers.
278 	 */
279 	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
280 	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
281 };
282 
283 #define DLIL_TO_IFP(s)  (&s->dl_if)
284 #define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)
285 
286 struct ifnet_filter {
287 	TAILQ_ENTRY(ifnet_filter)       filt_next;
288 	u_int32_t                       filt_skip;
289 	u_int32_t                       filt_flags;
290 	ifnet_t                         filt_ifp;
291 	const char                      *filt_name;
292 	void                            *filt_cookie;
293 	protocol_family_t               filt_protocol;
294 	iff_input_func                  filt_input;
295 	iff_output_func                 filt_output;
296 	iff_event_func                  filt_event;
297 	iff_ioctl_func                  filt_ioctl;
298 	iff_detached_func               filt_detached;
299 };
300 
301 /* Mbuf queue used for freeing the excessive mbufs */
302 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
303 
304 struct proto_input_entry;
305 
306 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
307 
308 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
309 
310 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
311 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
312 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
313 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
314 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
315 
316 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
317 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
318     &dlil_lck_attributes);
319 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
320     &dlil_lck_attributes);
321 
322 #if DEBUG
323 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
324 #else
325 static unsigned int ifnet_debug;        /* debugging (disabled) */
326 #endif /* !DEBUG */
327 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
328 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
329 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
330 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
331 
332 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
333 
334 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
335 
336 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
337 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
338 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
339 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
340 
341 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
342 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
343 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
344 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
345 
346 static u_int32_t net_rtref;
347 
348 static struct dlil_main_threading_info dlil_main_input_thread_info;
349 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
350     (struct dlil_threading_info *)&dlil_main_input_thread_info;
351 
352 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
353 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
354 static void dlil_if_trace(struct dlil_ifnet *, int);
355 static void if_proto_ref(struct if_proto *);
356 static void if_proto_free(struct if_proto *);
357 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
358 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
359     u_int32_t list_count);
360 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
361 static void if_flt_monitor_busy(struct ifnet *);
362 static void if_flt_monitor_unbusy(struct ifnet *);
363 static void if_flt_monitor_enter(struct ifnet *);
364 static void if_flt_monitor_leave(struct ifnet *);
365 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
366     char **, protocol_family_t);
367 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
368     protocol_family_t);
369 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
370     const struct sockaddr_dl *);
371 static int ifnet_lookup(struct ifnet *);
372 static void if_purgeaddrs(struct ifnet *);
373 
374 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
375     struct mbuf *, char *);
376 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
377     struct mbuf *);
378 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
379     mbuf_t *, const struct sockaddr *, void *, char *, char *);
380 static void ifproto_media_event(struct ifnet *, protocol_family_t,
381     const struct kev_msg *);
382 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
383     unsigned long, void *);
384 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
385     struct sockaddr_dl *, size_t);
386 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
387     const struct sockaddr_dl *, const struct sockaddr *,
388     const struct sockaddr_dl *, const struct sockaddr *);
389 
390 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
391     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
392     boolean_t poll, struct thread *tp);
393 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
394     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
395 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
396 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
397     protocol_family_t *);
398 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
399     const struct ifnet_demux_desc *, u_int32_t);
400 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
401 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
402 #if !XNU_TARGET_OS_OSX
403 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
404     const struct sockaddr *, const char *, const char *,
405     u_int32_t *, u_int32_t *);
406 #else /* XNU_TARGET_OS_OSX */
407 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
408     const struct sockaddr *, const char *, const char *);
409 #endif /* XNU_TARGET_OS_OSX */
410 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
411     const struct sockaddr *, const char *, const char *,
412     u_int32_t *, u_int32_t *);
413 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
414 static void ifp_if_free(struct ifnet *);
415 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
416 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
417 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
418 
419 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
420     dlil_freeq_t *, struct ifnet_stat_increment_param *);
421 
422 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
423     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
424     boolean_t, struct thread *);
425 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
426     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
427     boolean_t, struct thread *);
428 
429 static void dlil_main_input_thread_func(void *, wait_result_t);
430 static void dlil_main_input_thread_cont(void *, wait_result_t);
431 
432 static void dlil_input_thread_func(void *, wait_result_t);
433 static void dlil_input_thread_cont(void *, wait_result_t);
434 
435 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
436 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
437 
438 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
439     thread_continue_t *);
440 static void dlil_terminate_input_thread(struct dlil_threading_info *);
441 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
442     struct dlil_threading_info *, struct ifnet *, boolean_t);
443 static boolean_t dlil_input_stats_sync(struct ifnet *,
444     struct dlil_threading_info *);
445 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
446     u_int32_t, ifnet_model_t, boolean_t);
447 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
448     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
449 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
450 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
451 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
452 #if DEBUG || DEVELOPMENT
453 static void dlil_verify_sum16(void);
454 #endif /* DEBUG || DEVELOPMENT */
455 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
456     protocol_family_t);
457 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
458     protocol_family_t);
459 
460 static void dlil_incr_pending_thread_count(void);
461 static void dlil_decr_pending_thread_count(void);
462 
463 static void ifnet_detacher_thread_func(void *, wait_result_t);
464 static void ifnet_detacher_thread_cont(void *, wait_result_t);
465 static void ifnet_detach_final(struct ifnet *);
466 static void ifnet_detaching_enqueue(struct ifnet *);
467 static struct ifnet *ifnet_detaching_dequeue(void);
468 
469 static void ifnet_start_thread_func(void *, wait_result_t);
470 static void ifnet_start_thread_cont(void *, wait_result_t);
471 
472 static void ifnet_poll_thread_func(void *, wait_result_t);
473 static void ifnet_poll_thread_cont(void *, wait_result_t);
474 
475 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
476     classq_pkt_t *, boolean_t, boolean_t *);
477 
478 static void ifp_src_route_copyout(struct ifnet *, struct route *);
479 static void ifp_src_route_copyin(struct ifnet *, struct route *);
480 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
481 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
482 
483 static errno_t if_mcasts_update_async(struct ifnet *);
484 
485 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
486 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
487 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
488 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
489 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
490 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
491 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
492 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
493 static int sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS;
494 static int sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS;
495 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
496 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
497 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
498 
499 struct chain_len_stats tx_chain_len_stats;
500 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
501 
502 #if TEST_INPUT_THREAD_TERMINATION
503 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
504 #endif /* TEST_INPUT_THREAD_TERMINATION */
505 
506 /* The following are protected by dlil_ifnet_lock */
507 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
508 static u_int32_t ifnet_detaching_cnt;
509 static boolean_t ifnet_detaching_embryonic;
510 static void *ifnet_delayed_run; /* wait channel for detaching thread */
511 
512 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
513     &dlil_lck_attributes);
514 
515 static uint32_t ifnet_flowhash_seed;
516 
517 struct ifnet_flowhash_key {
518 	char            ifk_name[IFNAMSIZ];
519 	uint32_t        ifk_unit;
520 	uint32_t        ifk_flags;
521 	uint32_t        ifk_eflags;
522 	uint32_t        ifk_capabilities;
523 	uint32_t        ifk_capenable;
524 	uint32_t        ifk_output_sched_model;
525 	uint32_t        ifk_rand1;
526 	uint32_t        ifk_rand2;
527 };
528 
529 /* Flow control entry per interface */
530 struct ifnet_fc_entry {
531 	RB_ENTRY(ifnet_fc_entry) ifce_entry;
532 	u_int32_t       ifce_flowhash;
533 	struct ifnet    *ifce_ifp;
534 };
535 
536 static uint32_t ifnet_calc_flowhash(struct ifnet *);
537 static int ifce_cmp(const struct ifnet_fc_entry *,
538     const struct ifnet_fc_entry *);
539 static int ifnet_fc_add(struct ifnet *);
540 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
541 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
542 
543 /* protected by ifnet_fc_lock */
544 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
545 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
546 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
547 
548 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
549 
550 extern void bpfdetach(struct ifnet *);
551 extern void proto_input_run(void);
552 
553 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
554     u_int32_t flags);
555 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
556     u_int32_t flags);
557 
558 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
559 
560 #if CONFIG_MACF
561 #if !XNU_TARGET_OS_OSX
562 int dlil_lladdr_ckreq = 1;
563 #else /* XNU_TARGET_OS_OSX */
564 int dlil_lladdr_ckreq = 0;
565 #endif /* XNU_TARGET_OS_OSX */
566 #endif /* CONFIG_MACF */
567 
568 #if DEBUG
569 int dlil_verbose = 1;
570 #else
571 int dlil_verbose = 0;
572 #endif /* DEBUG */
573 #if IFNET_INPUT_SANITY_CHK
574 /* sanity checking of input packet lists received */
575 static u_int32_t dlil_input_sanity_check = 0;
576 #endif /* IFNET_INPUT_SANITY_CHK */
577 /* rate limit debug messages */
578 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
579 
580 SYSCTL_DECL(_net_link_generic_system);
581 
582 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
583     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
584 
585 #define IF_SNDQ_MINLEN  32
586 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
587 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
588     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
589     sysctl_sndq_maxlen, "I", "Default transmit queue max length");
590 
591 #define IF_RCVQ_MINLEN  32
592 #define IF_RCVQ_MAXLEN  256
593 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
594 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
595     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
596     sysctl_rcvq_maxlen, "I", "Default receive queue max length");
597 
598 /*
599  * Protect against possible memory starvation that may happen
600  * when the driver is pushing data faster than the AP can process.
601  *
602  * If at any point during DLIL input phase any of the input queues
603  * exceeds the burst limit, DLIL will start to trim the queue,
604  * by returning mbufs in the input queue to the cache from which
605  * the mbufs were originally allocated, starting from the oldest
606  * mbuf and continuing until the new limit (see below) is reached.
607  *
608  * In order to avoid a steplocked equilibrium, the trimming
609  * will continue PAST the burst limit, until the corresponding
610  * input queue is reduced to `if_rcvq_trim_pct' %.
611  *
612  * For example, if the input queue limit is 1024 packets,
613  * and the trim percentage (`if_rcvq_trim_pct') is 80 %,
614  * the trimming will continue until the queue contains 819 packets
615  * (1024 * 80 / 100 == 819).
616  *
617  * Setting the burst limit too low can hurt the throughput,
618  * while setting the burst limit too high can defeat the purpose.
619  */
620 #define IF_RCVQ_BURST_LIMIT_MIN         1024
621 #define IF_RCVQ_BURST_LIMIT_DEFAULT     8192
622 #define IF_RCVQ_BURST_LIMIT_MAX         32768
623 uint32_t if_rcvq_burst_limit = IF_RCVQ_BURST_LIMIT_DEFAULT;
624 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_burst_limit,
625     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_burst_limit, IF_RCVQ_BURST_LIMIT_DEFAULT,
626     sysctl_rcvq_burst_limit, "I", "Upper memory limit for inbound data");
627 
628 #define IF_RCVQ_TRIM_PCT_MIN            20
629 #define IF_RCVQ_TRIM_PCT_DEFAULT        80
630 #define IF_RCVQ_TRIM_PCT_MAX            100
631 uint32_t if_rcvq_trim_pct = IF_RCVQ_TRIM_PCT_DEFAULT;
632 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_trim_pct,
633     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_trim_pct, IF_RCVQ_TRIM_PCT_DEFAULT,
634     sysctl_rcvq_trim_pct, "I",
635     "Percentage (0 - 100) of the queue limit to keep after detecting an overflow burst");
636 
637 #define IF_RXPOLL_DECAY         2       /* ilog2 of EWMA decay rate (4) */
638 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
639 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
640     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
641     "ilog2 of EWMA decay rate of avg inbound packets");
642 
643 #define IF_RXPOLL_MODE_HOLDTIME_MIN     (10ULL * 1000 * 1000)   /* 10 ms */
644 #define IF_RXPOLL_MODE_HOLDTIME         (1000ULL * 1000 * 1000) /* 1 sec */
645 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
646 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
647     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
648     IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
649     "Q", "input poll mode freeze time");
650 
651 #define IF_RXPOLL_SAMPLETIME_MIN        (1ULL * 1000 * 1000)    /* 1 ms */
652 #define IF_RXPOLL_SAMPLETIME            (10ULL * 1000 * 1000)   /* 10 ms */
653 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
654 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
655     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
656     IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
657     "Q", "input poll sampling time");
658 
659 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
660 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
661     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
662     IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
663     "Q", "input poll interval (time)");
664 
665 #define IF_RXPOLL_INTERVAL_PKTS 0       /* 0 (disabled) */
666 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
667 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
668     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
669     IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
670 
671 #define IF_RXPOLL_WLOWAT        10
672 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
673 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
674     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
675     IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
676     "I", "input poll wakeup low watermark");
677 
678 #define IF_RXPOLL_WHIWAT        100
679 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
680 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
681     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
682     IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
683     "I", "input poll wakeup high watermark");
684 
685 static u_int32_t if_rxpoll_max = 0;                     /* 0 (automatic) */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
    "max packets per poll call");

/* opportunistic input polling master switch; validated by sysctl_rxpoll */
u_int32_t if_rxpoll = 1;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
    sysctl_rxpoll, "I", "enable opportunistic input polling");

#if TEST_INPUT_THREAD_TERMINATION
static u_int32_t if_input_thread_termination_spin = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &if_input_thread_termination_spin, 0,
    sysctl_input_thread_termination_spin,
    "I", "input thread termination spin limit");
#endif /* TEST_INPUT_THREAD_TERMINATION */

/* read-only count of DLIL input threads currently in existence */
static u_int32_t cur_dlil_input_threads = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
    CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
    "Current number of DLIL input threads");

#if IFNET_INPUT_SANITY_CHK
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
    "Turn on sanity checking in DLIL input");
#endif /* IFNET_INPUT_SANITY_CHK */

/*
 * NOTE(review): the argument after the variable pointer is "1" in the two
 * entries below but "0" everywhere else in this file; with a non-NULL
 * pointer that argument is presumably unused (standard sysctl semantics)
 * so this looks harmless, but confirm and consider normalizing to 0.
 */
static u_int32_t if_flowadv = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
    "enable flow-advisory mechanism");

static u_int32_t if_delaybased_queue = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
    "enable delay based dynamic queue sizing");

static uint64_t hwcksum_in_invalidated = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");

uint32_t hwcksum_dbg = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
    "enable hardware cksum debugging");

/* count of transmit starts that were deferred (see start_delay_disabled) */
u_int32_t ifnet_start_delayed = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
    "number of times start was delayed");
739 
740 u_int32_t ifnet_delay_start_disabled = 0;
741 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
742     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
743     "number of times start was delayed");
744 
/*
 * Atomically bump the counter exported via the
 * net.link.generic.system.start_delay_disabled sysctl.
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
750 
#define HWCKSUM_DBG_PARTIAL_FORCED      0x1     /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ   0x2     /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED     0x10    /* forced finalize */
#define HWCKSUM_DBG_MASK \
	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |   \
	HWCKSUM_DBG_FINALIZE_FORCED)

/* bitmask of HWCKSUM_DBG_* flags; validated by sysctl_hwcksum_dbg_mode */
static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

/*
 * NOTE(review): arg2 == 9 here is passed to sysctl_tx_chain_len_stats;
 * presumably the number of chain-length buckets — confirm against the
 * handler's implementation.
 */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

static uint32_t threshold_notify = 1;           /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2;         /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t        inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
868 
/*
 * Maintain the per-interface count of filters that preclude TSO, and bump
 * the route generation ID so TCP reevaluates whether to use TSO.
 * Called with filter_enable == TRUE when such a filter attaches and
 * FALSE when it detaches; the count must never underflow.
 */
void
ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
{
	/*
	 * update filter count and route_generation ID to let TCP
	 * know it should reevaluate doing TSO or not
	 */
	if (filter_enable) {
		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
	} else {
		VERIFY(ifp->if_flt_no_tso_count != 0);
		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
	}
	routegenid_update();
}
884 
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* derived booleans snapshotting individual IF_ATTACH_NX_DEFAULT bits */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
903 
#if (DEVELOPMENT || DEBUG)
/*
 * Sysctl handler for net.link.generic.system.if_attach_nx.  Accepts a new
 * auto-attach flag word, except that the flowswitch transport netagent
 * bit may not be toggled at runtime (ENOTSUP).
 */
static int
if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error;

	error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
	    &new_value, &changed);
	if (error != 0 || !changed) {
		return error;
	}
	/* reject any write that flips the transport netagent bit */
	if (((new_value ^ if_attach_nx) &
	    IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
		return ENOTSUP;
	}
	if_attach_nx = new_value;
	return 0;
}

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
931 
932 static int
933 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
934 {
935 #pragma unused(oidp, arg1, arg2)
936 	unsigned int new_value;
937 	int changed;
938 	int error;
939 
940 	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
941 	    sizeof(if_enable_fsw_transport_netagent),
942 	    &new_value, &changed);
943 	if (error == 0 && changed != 0) {
944 		if (new_value != 0 && new_value != 1) {
945 			/* only allow 0 or 1 */
946 			error = EINVAL;
947 		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
948 			/* netagent can be enabled/disabled */
949 			if_enable_fsw_transport_netagent = new_value;
950 			if (new_value == 0) {
951 				kern_nexus_deregister_netagents();
952 			} else {
953 				kern_nexus_register_netagents();
954 			}
955 		} else {
956 			/* netagent can't be enabled */
957 			error = ENOTSUP;
958 		}
959 	}
960 	return error;
961 }
962 
963 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
964     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
965     0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
966     "enable flowswitch netagent");
967 
/* forward declaration; defined after dlil_attach_flowswitch_nexus below */
static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
971 
/*
 * Returns TRUE when automatic nexus attachment has been disabled for this
 * interface (IFXF_NX_NOAUTO set in if_xflags).
 */
boolean_t
ifnet_nx_noauto(ifnet_t ifp)
{
	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
}
977 
/*
 * Returns TRUE when a flowswitch must not be auto-attached to this
 * interface; currently equivalent to the interface being low-latency.
 */
boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}
983 
/* Returns TRUE when IFXF_LOW_LATENCY is set in if_xflags. */
boolean_t
ifnet_is_low_latency(ifnet_t ifp)
{
	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
}
989 
/*
 * Decide whether this interface should be plumbed with the netif compat
 * layer.  Requires IF_ATTACH_NX_NETIF_COMPAT in the global if_attach_nx
 * flags; on non-macOS targets, a Wi-Fi "ap" interface only gets compat
 * when if_netif_all is set (memory conservation), all other interfaces do.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point: interface name is exactly "ap" */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
1014 
1015 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)1016 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
1017 {
1018 	if (if_is_fsw_transport_netagent_enabled()) {
1019 		/* check if netagent has been manually enabled for ipsec/utun */
1020 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
1021 			return ipsec_interface_needs_netagent(ifp);
1022 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
1023 			return utun_interface_needs_netagent(ifp);
1024 		}
1025 
1026 		/* check ifnet no auto nexus override */
1027 		if (ifnet_nx_noauto(ifp)) {
1028 			return FALSE;
1029 		}
1030 
1031 		/* check global if_attach_nx configuration */
1032 		switch (ifp->if_family) {
1033 		case IFNET_FAMILY_CELLULAR:
1034 		case IFNET_FAMILY_ETHERNET:
1035 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
1036 				return TRUE;
1037 			}
1038 			break;
1039 		default:
1040 			break;
1041 		}
1042 	}
1043 	return FALSE;
1044 }
1045 
1046 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)1047 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
1048 {
1049 #pragma unused(ifp)
1050 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
1051 		return TRUE;
1052 	}
1053 	return FALSE;
1054 }
1055 
/*
 * Returns TRUE when the global if_attach_nx configuration enables the
 * netif netagent; the interface argument is currently unused.
 */
boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)
{
#pragma unused(ifp)
	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
}
1062 
/*
 * Free a single nexus provider instance, first detaching its device port
 * when one is present.  Returns FALSE when there is no instance to free,
 * TRUE otherwise.  Errors from the underlying kern_nexus calls are logged
 * (tagged with func_str, the caller's name) but not propagated.
 */
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t         err;

	if (instance == NULL || uuid_is_null(instance)) {
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}
1089 
/*
 * Tear down a nexus: detach/free the instance (and its device port) via
 * dlil_detach_nexus_instance(), then deregister the provider.  Returns
 * TRUE when either the instance or the provider was present; errors are
 * logged but not propagated.
 */
static boolean_t
dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
    uuid_t device)
{
	boolean_t               detached = FALSE;
	nexus_controller_t      controller = kern_nexus_shared_controller();
	int                     err;

	if (dlil_detach_nexus_instance(controller, func_str, instance,
	    device)) {
		detached = TRUE;
	}
	if (provider != NULL && !uuid_is_null(provider)) {
		detached = TRUE;
		err = kern_nexus_controller_deregister_provider(controller,
		    provider);
		if (err != 0) {
			DLIL_PRINTF("%s deregister_provider %d\n",
			    func_str, err);
		}
	}
	return detached;
}
1113 
/*
 * Register a nexus provider (named "com.apple.<type>.<ifname>") under the
 * default domain provider for `type` (netif or flowswitch) and allocate an
 * instance of it, returning both UUIDs through *provider / *instance.
 * Returns 0 on success; on instance-allocation failure the provider is
 * deregistered before returning the error.  Note: the success path also
 * falls through the `failed:` label, returning err == 0.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t          dom_prov;
	errno_t         err;
	nexus_name_t    provider_name;
	const char      *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* undo the provider registration above */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
failed:
	return err;
}
1163 
/*
 * Create a netif nexus provider + instance for ifp and attach it to the
 * interface, recording the resulting UUIDs in netif_nx.  Returns TRUE on
 * success; on failure any created provider/instance is torn down via
 * dlil_detach_nexus() and FALSE is returned.  A no-op (FALSE) when the
 * interface already has Skywalk capability (IFCAP_SKYWALK).
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1217 
1218 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1219 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1220 {
1221 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1222 	    IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
1223 		goto failed;
1224 	}
1225 	switch (ifp->if_type) {
1226 	case IFT_CELLULAR:
1227 	case IFT_ETHER:
1228 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1229 			/* don't auto-attach */
1230 			goto failed;
1231 		}
1232 		break;
1233 	default:
1234 		/* don't auto-attach */
1235 		goto failed;
1236 	}
1237 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
1238 
1239 failed:
1240 	return FALSE;
1241 }
1242 
1243 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1244 dlil_is_native_netif_nexus(ifnet_t ifp)
1245 {
1246 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1247 }
1248 
/* Tear down the netif nexus recorded in nexus_netif (provider, instance,
 * and attach/device handle) via dlil_detach_nexus(). */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1256 
1257 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1258 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1259 {
1260 	struct ifreq        ifr;
1261 	int                 error;
1262 
1263 	bzero(&ifr, sizeof(ifr));
1264 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1265 	if (error == 0) {
1266 		*ifdm_p = ifr.ifr_devmtu;
1267 	}
1268 	return error;
1269 }
1270 
/*
 * macOS-only: for Skywalk-native netifs, grow *large_buf_size to cover the
 * driver's advertised TSO MTU (or sk_fsw_gso_mtu when no TSO MTU is set),
 * capped at NX_FSW_MAX_LARGE_BUFSIZE.  No-op on other targets or for
 * non-native interfaces.
 */
static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
{
#ifdef XNU_TARGET_OS_OSX
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;

	if (!dlil_is_native_netif_nexus(ifp)) {
		return;
	}
	/*
	 * Note that we are reading the real hwassist flags set by the driver
	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
	 * hasn't been called yet.
	 */
	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
		tso_v4_mtu = ifp->if_tso_v4_mtu;
	}
	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
		tso_v6_mtu = ifp->if_tso_v6_mtu;
	}
	/*
	 * If the hardware supports TSO, adjust the large buf size to match the
	 * supported TSO MTU size.
	 */
	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
	} else {
		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
	}
	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
#else
#pragma unused(ifp, large_buf_size)
#endif /* XNU_TARGET_OS_OSX */
}
1306 
1307 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1308 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1309     bool *use_multi_buflet, uint32_t *large_buf_size)
1310 {
1311 	struct kern_pbufpool_memory_info rx_pp_info;
1312 	struct kern_pbufpool_memory_info tx_pp_info;
1313 	uint32_t if_max_mtu = 0;
1314 	uint32_t drv_buf_size;
1315 	struct ifdevmtu ifdm;
1316 	int err;
1317 
1318 	/*
1319 	 * To perform intra-stack RX aggregation flowswitch needs to use
1320 	 * multi-buflet packet.
1321 	 */
1322 	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1323 
1324 	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1325 	/*
1326 	 * IP over Thunderbolt interface can deliver the largest IP packet,
1327 	 * but the driver advertises the MAX MTU as only 9K.
1328 	 */
1329 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1330 		if_max_mtu = IP_MAXPACKET;
1331 		goto skip_mtu_ioctl;
1332 	}
1333 
1334 	/* determine max mtu */
1335 	bzero(&ifdm, sizeof(ifdm));
1336 	err = dlil_siocgifdevmtu(ifp, &ifdm);
1337 	if (__improbable(err != 0)) {
1338 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1339 		    __func__, if_name(ifp));
1340 		/* use default flowswitch buffer size */
1341 		if_max_mtu = NX_FSW_BUFSIZE;
1342 	} else {
1343 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1344 		    ifdm.ifdm_max, ifdm.ifdm_current);
1345 		/* rdar://problem/44589731 */
1346 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1347 	}
1348 
1349 skip_mtu_ioctl:
1350 	if (if_max_mtu == 0) {
1351 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1352 		    __func__, if_name(ifp));
1353 		return EINVAL;
1354 	}
1355 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1356 		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1357 		    "max bufsize(%d)\n", __func__,
1358 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1359 		return EINVAL;
1360 	}
1361 
1362 	/*
1363 	 * for skywalk native driver, consult the driver packet pool also.
1364 	 */
1365 	if (dlil_is_native_netif_nexus(ifp)) {
1366 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1367 		    &tx_pp_info);
1368 		if (err != 0) {
1369 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1370 			    __func__, if_name(ifp));
1371 			return ENXIO;
1372 		}
1373 		drv_buf_size = tx_pp_info.kpm_bufsize *
1374 		    tx_pp_info.kpm_max_frags;
1375 		if (if_max_mtu > drv_buf_size) {
1376 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1377 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1378 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1379 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1380 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1381 			return EINVAL;
1382 		}
1383 	} else {
1384 		drv_buf_size = if_max_mtu;
1385 	}
1386 
1387 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1388 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1389 		*use_multi_buflet = true;
1390 		/* default flowswitch buffer size */
1391 		*buf_size = NX_FSW_BUFSIZE;
1392 		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1393 	} else {
1394 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1395 	}
1396 	_dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1397 	ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1398 	if (*buf_size >= *large_buf_size) {
1399 		*large_buf_size = 0;
1400 	}
1401 	return 0;
1402 }
1403 
/*
 * Create and attach a flowswitch nexus for ifp, recording the resulting
 * UUIDs in nexus_fsw.  Skipped (FALSE) for no-auto/low-latency/vmnet
 * interfaces, interfaces without Skywalk capability, or when the global
 * if_attach_nx configuration disables flowswitch auto-attach.  On any
 * failure, partially-created provider/instance state is torn down and
 * FALSE is returned.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err = 0;
	uuid_t                  netif;
	uint32_t                buf_size = 0;
	uint32_t                large_buf_size = 0;
	bool                    multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	/* err == 0 here means we declined to attach rather than failed */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1502 
1503 static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)1504 dlil_attach_flowswitch_nexus(ifnet_t ifp)
1505 {
1506 	boolean_t               attached;
1507 	if_nexus_flowswitch     nexus_fsw;
1508 
1509 #if (DEVELOPMENT || DEBUG)
1510 	if (skywalk_netif_direct_allowed(if_name(ifp))) {
1511 		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
1512 		return FALSE;
1513 	}
1514 #endif /* (DEVELOPMENT || DEBUG) */
1515 
1516 	/*
1517 	 * flowswitch attachment is not supported for interface using the
1518 	 * legacy model (IFNET_INIT_LEGACY)
1519 	 */
1520 	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
1521 		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
1522 		    if_name(ifp));
1523 		return FALSE;
1524 	}
1525 
1526 	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
1527 		/* it's already attached */
1528 		return FALSE;
1529 	}
1530 	bzero(&nexus_fsw, sizeof(nexus_fsw));
1531 	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
1532 	if (attached) {
1533 		ifnet_lock_exclusive(ifp);
1534 		if (!IF_FULLY_ATTACHED(ifp)) {
1535 			/* interface is going away */
1536 			attached = FALSE;
1537 		} else {
1538 			ifp->if_nx_flowswitch = nexus_fsw;
1539 		}
1540 		ifnet_lock_done(ifp);
1541 		if (!attached) {
1542 			/* clean up flowswitch nexus */
1543 			dlil_detach_flowswitch_nexus(&nexus_fsw);
1544 		}
1545 	}
1546 	return attached;
1547 }
1548 
/* Tear down the flowswitch nexus recorded in nexus_fsw (provider,
 * instance, and device port) via dlil_detach_nexus(). */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1556 
1557 __attribute__((noinline))
1558 static void
dlil_netif_detach_notify(ifnet_t ifp)1559 dlil_netif_detach_notify(ifnet_t ifp)
1560 {
1561 	ifnet_detach_notify_cb_t notify = NULL;
1562 	void *arg = NULL;
1563 
1564 	ifnet_get_detach_notify(ifp, &notify, &arg);
1565 	if (notify == NULL) {
1566 		DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1567 		return;
1568 	}
1569 	(*notify)(arg);
1570 }
1571 
/*
 * Suspend and drain data movement on ifp, detach the flowswitch nexus
 * (if present) followed by the netif nexus (if present), zero the
 * corresponding state in the ifnet, then resume data movement.  The
 * asserts enforce that each nexus's UUIDs are either all set or all null.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1603 
1604 boolean_t
ifnet_add_netagent(ifnet_t ifp)1605 ifnet_add_netagent(ifnet_t ifp)
1606 {
1607 	int     error;
1608 
1609 	error = kern_nexus_interface_add_netagent(ifp);
1610 	os_log(OS_LOG_DEFAULT,
1611 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1612 	    ifp->if_xname, error);
1613 	return error == 0;
1614 }
1615 
1616 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1617 ifnet_remove_netagent(ifnet_t ifp)
1618 {
1619 	int     error;
1620 
1621 	error = kern_nexus_interface_remove_netagent(ifp);
1622 	os_log(OS_LOG_DEFAULT,
1623 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1624 	    ifp->if_xname, error);
1625 	return error == 0;
1626 }
1627 
1628 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1629 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1630 {
1631 	if (!IF_FULLY_ATTACHED(ifp)) {
1632 		return FALSE;
1633 	}
1634 	return dlil_attach_flowswitch_nexus(ifp);
1635 }
1636 
/*
 * Detach the flowswitch nexus from ifp.  The nexus state is snapshotted
 * and cleared from the ifnet under the ifnet lock, then torn down outside
 * the lock.  Returns TRUE when anything was actually detached.
 */
boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)
{
	if_nexus_flowswitch     nexus_fsw;

	ifnet_lock_exclusive(ifp);
	nexus_fsw = ifp->if_nx_flowswitch;
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
	ifnet_lock_done(ifp);
	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
}
1649 
1650 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1651 ifnet_attach_netif_nexus(ifnet_t ifp)
1652 {
1653 	boolean_t       nexus_attached;
1654 	if_nexus_netif  nexus_netif;
1655 
1656 	if (!IF_FULLY_ATTACHED(ifp)) {
1657 		return FALSE;
1658 	}
1659 	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1660 	if (nexus_attached) {
1661 		ifnet_lock_exclusive(ifp);
1662 		ifp->if_nx_netif = nexus_netif;
1663 		ifnet_lock_done(ifp);
1664 	}
1665 	return nexus_attached;
1666 }
1667 
1668 boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)1669 ifnet_detach_netif_nexus(ifnet_t ifp)
1670 {
1671 	if_nexus_netif  nexus_netif;
1672 
1673 	ifnet_lock_exclusive(ifp);
1674 	nexus_netif = ifp->if_nx_netif;
1675 	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
1676 	ifnet_lock_done(ifp);
1677 
1678 	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
1679 	           nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
1680 }
1681 
1682 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1683 ifnet_attach_native_flowswitch(ifnet_t ifp)
1684 {
1685 	if (!dlil_is_native_netif_nexus(ifp)) {
1686 		/* not a native netif */
1687 		return;
1688 	}
1689 	ifnet_attach_flowswitch_nexus(ifp);
1690 }
1691 
/*
 * Install (or clear, with cb == NULL) the flowswitch RX callback and
 * its argument on the interface.  Always returns 0.
 */
int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	/*
	 * Wait for all outstanding references on the current callback
	 * (taken via ifnet_get_flowswitch_rx_callback) to be released
	 * before replacing it.
	 */
	while (ifp->if_fsw_rx_cb_ref > 0) {
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1707 
/*
 * Return the flowswitch RX callback and argument for 'ifp', taking a
 * reference that blocks ifnet_set_flowswitch_rx_callback() until it is
 * dropped via ifnet_release_flowswitch_rx_callback().  Returns ENOENT
 * when no callback is installed.
 */
int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * This is for avoiding the unnecessary lock acquire for interfaces
	 * not used by a redirect interface.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* re-check under the lock; the callback may have been cleared */
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1729 
1730 void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)1731 ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
1732 {
1733 	lck_mtx_lock(&ifp->if_delegate_lock);
1734 	if (--ifp->if_fsw_rx_cb_ref == 0) {
1735 		wakeup(&ifp->if_fsw_rx_cb_ref);
1736 	}
1737 	lck_mtx_unlock(&ifp->if_delegate_lock);
1738 }
1739 
/*
 * Set (or clear, with parent == NULL) the delegate parent interface of
 * 'difp', waiting first for all outstanding references taken via
 * ifnet_get_delegate_parent() to be released.  Always returns 0.
 */
int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	while (difp->if_delegate_parent_ref > 0) {
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1754 
1755 int
ifnet_get_delegate_parent(ifnet_t difp,ifnet_t * parentp)1756 ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
1757 {
1758 	lck_mtx_lock(&difp->if_delegate_lock);
1759 	if (difp->if_delegate_parent == NULL) {
1760 		lck_mtx_unlock(&difp->if_delegate_lock);
1761 		return ENOENT;
1762 	}
1763 	*parentp = difp->if_delegate_parent;
1764 	difp->if_delegate_parent_ref++;
1765 	lck_mtx_unlock(&difp->if_delegate_lock);
1766 	return 0;
1767 }
1768 
1769 void
ifnet_release_delegate_parent(ifnet_t difp)1770 ifnet_release_delegate_parent(ifnet_t difp)
1771 {
1772 	lck_mtx_lock(&difp->if_delegate_lock);
1773 	if (--difp->if_delegate_parent_ref == 0) {
1774 		wakeup(&difp->if_delegate_parent_ref);
1775 	}
1776 	lck_mtx_unlock(&difp->if_delegate_lock);
1777 }
1778 
/*
 * Set the detach-notification callback and its argument for 'ifp'.
 * Caller must hold the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}
1787 
/*
 * Fetch the detach-notification callback and its argument for 'ifp'.
 * Caller must hold the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	*notifyp = ifp->if_detach_notify;
	*argp = ifp->if_detach_notify_arg;
}
1796 
/*
 * Locking wrapper around ifnet_set_detach_notify_locked(): takes the
 * ifnet lock exclusively for the update.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_exclusive(ifp);
	ifnet_set_detach_notify_locked(ifp, notify, arg);
	ifnet_lock_done(ifp);
}
1805 
/*
 * Locking wrapper around ifnet_get_detach_notify_locked(): takes the
 * ifnet lock exclusively for the read.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_exclusive(ifp);
	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
	ifnet_lock_done(ifp);
}
1814 #endif /* SKYWALK */
1815 
/*
 * Sanity-check an inbound mbuf: it must carry a pkthdr and its rcvif
 * must match the interface it is being delivered on (loopback
 * excepted); otherwise panic.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	/* NOTREACHED */                                        \
	}                                                               \
}

/*
 * Exponentially-weighted moving average: fold 'new' into 'old' with
 * weight 1/2^decay; the first sample (old == 0) is taken as-is.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)

/* link-speed units used by the rxpoll watermark table below */
#define MBPS    (1ULL * 1000 * 1000)
#define GBPS    (MBPS * 1000)
1836 
/*
 * Per-link-speed watermarks used by opportunistic input polling when
 * deciding between interrupt and polling mode.
 */
struct rxpoll_time_tbl {
	u_int64_t       speed;          /* downlink speed */
	u_int32_t       plowat;         /* packets low watermark */
	u_int32_t       phiwat;         /* packets high watermark */
	u_int32_t       blowat;         /* bytes low watermark */
	u_int32_t       bhiwat;         /* bytes high watermark */
};

/* ordered by speed; the all-zero entry presumably terminates lookups — confirm at use sites */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
1853 
/*
 * dlil_pending_thread_cnt counts DLIL threads whose startup is still
 * pending (see dlil_incr/decr_pending_thread_count); the mutex guards
 * the counter and the associated wakeup channel.
 */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;
1857 
1858 static void
dlil_incr_pending_thread_count(void)1859 dlil_incr_pending_thread_count(void)
1860 {
1861 	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1862 	lck_mtx_lock(&dlil_thread_sync_lock);
1863 	dlil_pending_thread_cnt++;
1864 	lck_mtx_unlock(&dlil_thread_sync_lock);
1865 }
1866 
1867 static void
dlil_decr_pending_thread_count(void)1868 dlil_decr_pending_thread_count(void)
1869 {
1870 	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1871 	lck_mtx_lock(&dlil_thread_sync_lock);
1872 	VERIFY(dlil_pending_thread_cnt > 0);
1873 	dlil_pending_thread_cnt--;
1874 	if (dlil_pending_thread_cnt == 0) {
1875 		wakeup(&dlil_pending_thread_cnt);
1876 	}
1877 	lck_mtx_unlock(&dlil_thread_sync_lock);
1878 }
1879 
1880 int
proto_hash_value(u_int32_t protocol_family)1881 proto_hash_value(u_int32_t protocol_family)
1882 {
1883 	/*
1884 	 * dlil_proto_unplumb_all() depends on the mapping between
1885 	 * the hash bucket index and the protocol family defined
1886 	 * here; future changes must be applied there as well.
1887 	 */
1888 	switch (protocol_family) {
1889 	case PF_INET:
1890 		return 0;
1891 	case PF_INET6:
1892 		return 1;
1893 	case PF_VLAN:
1894 		return 2;
1895 	case PF_UNSPEC:
1896 	default:
1897 		return 3;
1898 	}
1899 }
1900 
1901 /*
1902  * Caller must already be holding ifnet lock.
1903  */
1904 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1905 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1906 {
1907 	struct if_proto *proto = NULL;
1908 	u_int32_t i = proto_hash_value(protocol_family);
1909 
1910 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1911 
1912 	if (ifp->if_proto_hash != NULL) {
1913 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1914 	}
1915 
1916 	while (proto != NULL && proto->protocol_family != protocol_family) {
1917 		proto = SLIST_NEXT(proto, next_hash);
1918 	}
1919 
1920 	if (proto != NULL) {
1921 		if_proto_ref(proto);
1922 	}
1923 
1924 	return proto;
1925 }
1926 
/* Take an additional reference on 'proto'; dropped via if_proto_free(). */
static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}
1932 
1933 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1934 
/*
 * Drop a reference on 'proto'.  When the last reference goes away:
 * invoke the protocol's detached callback, purge routes for the
 * interface/protocol pair, post KEV_DL_PROTO_DETACHED, mark the
 * interface down if no protocols remain attached, and free the
 * if_proto structure.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		/* other references remain; nothing more to do */
		return;
	}

	/* notify the protocol (v1 or v2 KPI) that it has been detached */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1996 
1997 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1998 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1999 {
2000 #if !MACH_ASSERT
2001 #pragma unused(ifp)
2002 #endif
2003 	unsigned int type = 0;
2004 	int ass = 1;
2005 
2006 	switch (what) {
2007 	case IFNET_LCK_ASSERT_EXCLUSIVE:
2008 		type = LCK_RW_ASSERT_EXCLUSIVE;
2009 		break;
2010 
2011 	case IFNET_LCK_ASSERT_SHARED:
2012 		type = LCK_RW_ASSERT_SHARED;
2013 		break;
2014 
2015 	case IFNET_LCK_ASSERT_OWNED:
2016 		type = LCK_RW_ASSERT_HELD;
2017 		break;
2018 
2019 	case IFNET_LCK_ASSERT_NOTOWNED:
2020 		/* nothing to do here for RW lock; bypass assert */
2021 		ass = 0;
2022 		break;
2023 
2024 	default:
2025 		panic("bad ifnet assert type: %d", what);
2026 		/* NOTREACHED */
2027 	}
2028 	if (ass) {
2029 		LCK_RW_ASSERT(&ifp->if_lock, type);
2030 	}
2031 }
2032 
/* Acquire ifp->if_lock for shared (read) access. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Acquire ifp->if_lock for exclusive (write) access. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release ifp->if_lock (shared or exclusive). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
2050 
#if INET
/* Acquire the per-interface IPv4 data lock for read access. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Acquire the per-interface IPv4 data lock for write access. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the per-interface IPv4 data lock. */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif
2070 
/* Acquire the per-interface IPv6 data lock for read access. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Acquire the per-interface IPv6 data lock for write access. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the per-interface IPv6 data lock. */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
2088 
/* Acquire the global interface-list lock for read access. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Acquire the global interface-list lock for write access. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface-list lock. */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert that the global interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
2112 
2113 /*
2114  * dlil_ifp_protolist
2115  * - get the list of protocols attached to the interface, or just the number
2116  *   of attached protocols
2117  * - if the number returned is greater than 'list_count', truncation occurred
2118  *
2119  * Note:
2120  * - caller must already be holding ifnet lock.
2121  */
2122 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)2123 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
2124     u_int32_t list_count)
2125 {
2126 	u_int32_t       count = 0;
2127 	int             i;
2128 
2129 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
2130 
2131 	if (ifp->if_proto_hash == NULL) {
2132 		goto done;
2133 	}
2134 
2135 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
2136 		struct if_proto *proto;
2137 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
2138 			if (list != NULL && count < list_count) {
2139 				list[count] = proto->protocol_family;
2140 			}
2141 			count++;
2142 		}
2143 	}
2144 done:
2145 	return count;
2146 }
2147 
/*
 * Copy up to 'count' attached protocol families into 'protolist' under
 * the ifnet lock; returns the total number attached, which may exceed
 * 'count' if truncation occurred.
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
2156 
/* Free a heap-allocated protocol-family list. */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
2162 
/*
 * Build and post a KEV_NETWORK_CLASS kernel event for 'ifp'.  When the
 * caller supplies no event_data, a minimal net_event_data carrying the
 * interface name/family/unit is used.  The interface generation count
 * is updated unless the event is a frequent link-quality/state change
 * or the caller asks for suppression.
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
	ev_msg.kev_class        = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass     = event_subclass;
	ev_msg.event_code       = event_code;

	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	/* stamp the interface identity into the (caller- or stack-owned) payload */
	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit   = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr    = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes  */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
2224 
/*
 * Allocate the per-interface TCP/UDP stats (from their zones, with an
 * embedded 64-bit alignment fix-up) and the IPv4/IPv6 ECN stats for
 * 'ifp'.  Returns 0 when the TCP/UDP pair was (re)allocated here,
 * EINVAL otherwise; on the EINVAL path any stats still attached to the
 * ifnet are released.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/*
	 * NOTE(review): ret stays EINVAL when the TCP/UDP stats were already
	 * present, and this path then frees them — presumably callers invoke
	 * this only once per ifp; confirm against call sites.
	 */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original zone pointer stashed before the aligned base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2310 
/*
 * Restore the interface's opportunistic-polling state to defaults:
 * clear the poll cycle, flags, request count and mode, and zero all
 * polling statistics and mode/sample timers.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2329 
/*
 * Set up the input machinery for 'inp' (the main DLIL input thread
 * when ifp is NULL), choosing the thread function and input strategy
 * from the interface's polling capabilities, initializing its lock and
 * packet queue, and starting the kernel thread.  Returns 0 on success
 * or ENODEV when the interface uses the synchronous (thread-less)
 * strategy; failure to start a thread is fatal (panic).
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* legacy hybrid polling requires net_rxpoll, IFEF_RXPOLL and IFXF_LEGACY */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	/* synchronous strategy: no dedicated thread to start */
	if (func == NULL) {
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2471 
2472 #if TEST_INPUT_THREAD_TERMINATION
/*
 * sysctl handler: read/write if_input_thread_termination_spin; setting
 * a new value is only permitted when rx polling is enabled (ENXIO
 * otherwise).  Test-build only.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* error, or read-only access: nothing to update */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
2494 #endif /* TEST_INPUT_THREAD_TERMINATION */
2495 
/*
 * Tear down a dlil_threading_info after its thread has terminated:
 * destroy its lock and lock group, reset all state, and verify that no
 * driver/poller threads or affinity settings remain.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	/* the packet queue must already have been drained */
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2521 
/*
 * Final act of an interface input thread, executed on the thread
 * itself once DLIL_INPUT_TERMINATE has been set: drain and free any
 * pending packets, signal termination completion to the waiter, drop
 * the thread reference from kernel_thread_start(), and terminate.
 * Does not return.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* take ownership of any queued packets before signalling completion */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2569 
2570 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2571 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2572 {
2573 	thread_affinity_policy_data_t policy;
2574 
2575 	bzero(&policy, sizeof(policy));
2576 	policy.affinity_tag = tag;
2577 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2578 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2579 }
2580 
2581 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Event-handler callback invoked when the set of active network-filter
 * subsystems changes: the flowswitch transport netagent is enabled
 * only while no filters (other than the private pf proxy) are active,
 * and the change is propagated to the netagents or NECP clients.
 */
static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,
    enum net_filter_event_subsystems state)
{
	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
		if_enable_fsw_transport_netagent = 1;
	} else {
		if_enable_fsw_transport_netagent = 0;
	}
	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
		kern_nexus_update_netagents();
	} else if (!if_enable_fsw_transport_netagent) {
		/* still disabled, but the filter set changed; refresh NECP clients */
		necp_update_all_clients();
	}
}
2598 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2599 
/*
 * dlil_init: one-time start-up of the Data Link Interface Layer.
 *
 * In order: verifies compile-time layout and flag-equivalence invariants,
 * parses networking boot-args, resolves the Skywalk netagent policy (when
 * built in), creates the zones and lists used for ifnet bookkeeping,
 * initializes dependent subsystems, and finally spawns the main DLIL
 * input thread plus the ifnet detacher thread, blocking until both have
 * been scheduled at least once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	/* ioctl logging flag/category values must mirror the ifnet ones */
	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	/* interface family/subfamily constants must mirror the ifnet ones */
	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* Honor boot-args that tune networking thread/runtime behavior */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	/* no dlil kernel threads should have been created at this point */
	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled.  This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	/* report when the boot-arg overrode the device-tree override */
	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
#ifdef XNU_TARGET_OS_OSX
	if (if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}
#endif /* XNU_TARGET_OS_OSX */

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	/* debug builds use the larger dlil_ifnet_dbg record */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2914 
2915 static void
if_flt_monitor_busy(struct ifnet * ifp)2916 if_flt_monitor_busy(struct ifnet *ifp)
2917 {
2918 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2919 
2920 	++ifp->if_flt_busy;
2921 	VERIFY(ifp->if_flt_busy != 0);
2922 }
2923 
/*
 * Drop one busy reference on the interface's filter list; thin alias
 * of if_flt_monitor_leave().  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2929 
2930 static void
if_flt_monitor_enter(struct ifnet * ifp)2931 if_flt_monitor_enter(struct ifnet *ifp)
2932 {
2933 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2934 
2935 	while (ifp->if_flt_busy) {
2936 		++ifp->if_flt_waiters;
2937 		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2938 		    (PZERO - 1), "if_flt_monitor", NULL);
2939 	}
2940 	if_flt_monitor_busy(ifp);
2941 }
2942 
2943 static void
if_flt_monitor_leave(struct ifnet * ifp)2944 if_flt_monitor_leave(struct ifnet *ifp)
2945 {
2946 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2947 
2948 	VERIFY(ifp->if_flt_busy != 0);
2949 	--ifp->if_flt_busy;
2950 
2951 	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2952 		ifp->if_flt_waiters = 0;
2953 		wakeup(&ifp->if_flt_head);
2954 	}
2955 }
2956 
/*
 * Attach an interface filter described by @if_filter to @ifp.
 *
 * Allocates an ifnet_filter, copies the caller-supplied callbacks and
 * cookie into it, inserts it at the tail of the interface's filter list
 * under the filter monitor, and updates the global/per-interface filter
 * statistics.  On success *filter_ref receives the new filter handle.
 *
 * Returns 0 on success, or ENXIO if the interface is not in the global
 * list or is no longer attached.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* take an I/O refcnt; released below once the filter is linked */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	/* Z_NOFAIL: allocation cannot return NULL */
	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	/* detach callback is installed unconditionally */
	filter->filt_detached = if_filter->iff_detached;

	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* third-party (non-OS) filters are tracked per interface */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	/* re-evaluate flow-switch compatibility with the new filter set */
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the I/O refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
3047 
/*
 * Detach an interface filter.
 *
 * @detached == 0: normal path — walk every attached interface looking
 * for @filter, mark it skipped, unlink it from the interface's filter
 * list under the filter monitor, then destroy it.  Returns EINVAL when
 * @filter is not found on any interface.
 *
 * @detached != 0: called from ifnet_detach_final(); the list has
 * already been emptied by the caller, so only the statistics are
 * adjusted before the filter is destroyed.
 */
static int
dlil_detach_filter_internal(interface_filter_t  filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				/* re-acquire and enter the monitor (may sleep) */
				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

	/* reached via goto from the lookup loop, or by falling through */
destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	/* re-evaluate flow-switch compatibility now that a filter is gone */
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* only the EINVAL path can land here with a non-NULL filter */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
3168 
3169 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)3170 dlil_detach_filter(interface_filter_t filter)
3171 {
3172 	if (filter == NULL) {
3173 		return;
3174 	}
3175 	dlil_detach_filter_internal(filter, 0);
3176 }
3177 
3178 __private_extern__ boolean_t
dlil_has_ip_filter(void)3179 dlil_has_ip_filter(void)
3180 {
3181 	boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
3182 
3183 	VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
3184 
3185 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
3186 	return has_filter;
3187 }
3188 
3189 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)3190 dlil_has_if_filter(struct ifnet *ifp)
3191 {
3192 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
3193 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
3194 	return has_filter;
3195 }
3196 
3197 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)3198 dlil_input_wakeup(struct dlil_threading_info *inp)
3199 {
3200 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3201 
3202 	inp->dlth_flags |= DLIL_INPUT_WAITING;
3203 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
3204 		inp->dlth_wtot++;
3205 		wakeup_one((caddr_t)&inp->dlth_flags);
3206 	}
3207 }
3208 
/*
 * Bootstrap entry for the main DLIL input thread.  Performs one-time
 * sanity checks, enters the embryonic state, issues a self-wakeup so
 * the continuation runs once immediately, then blocks into
 * dlil_main_input_thread_cont(), which never returns here.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before wakeup so the wakeup cannot be lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3231 
3232 /*
3233  * Main input thread:
3234  *
3235  *   a) handles all inbound packets for lo0
3236  *   b) handles all inbound packets for interfaces with no dedicated
3237  *	input thread (e.g. anything but Ethernet/PDP or those that support
3238  *	opportunistic polling.)
3239  *   c) protocol registrations
3240  *   d) packet injections
3241  */
/*
 * Continuation body of the main DLIL input thread.  Each activation
 * drains the shared (non-dedicated interface) and lo0 receive queues,
 * processes protocol registration/injection requests, then re-blocks
 * with itself as the continuation.  The thread is uninterruptible and
 * is never terminated.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation clears the embryonic flag */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* drop the lock while processing the drained chains */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* tell dlil_init() this thread has run once */
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* loop again only if more work was flagged while we ran */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	/* no pending work: re-arm the wait and block into ourselves */
	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3328 
3329 /*
3330  * Input thread for interfaces with legacy input model.
3331  */
/*
 * Bootstrap entry for a per-interface (legacy input model) input
 * thread.  Names the thread after the interface, enters the embryonic
 * state with a self-wakeup, then blocks into dlil_input_thread_cont(),
 * which never returns here.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* rxpoll-capable legacy interfaces use the rxpoll thread instead */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before wakeup so the wakeup cannot be lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3366 
/*
 * Continuation body of a per-interface input thread.  Each activation
 * drains the interface's receive queue, synchronizes input statistics
 * (unless a Skywalk netif already does so), and re-blocks with itself
 * as the continuation.  Unlike the main input thread, this thread can
 * be interrupted or asked to terminate, in which case it hands off to
 * dlil_terminate_input_thread() and never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* honor interruption/termination before doing any work */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation clears the embryonic flag */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while processing the drained chain */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* tell the creator this thread has run once */
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if more work was flagged while we ran */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* no pending work: re-arm the wait and block into ourselves */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3470 
3471 /*
3472  * Input thread for interfaces with opportunistic polling input model.
3473  */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	/*
	 * This entry point runs exactly once per dedicated rx-poll input
	 * thread; all subsequent wakeups resume in
	 * dlil_rxpoll_input_thread_cont() via thread_block_parameter().
	 */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	/*
	 * Mark the thread embryonic and park it on dlth_flags; the
	 * self-wakeup below makes the continuation run once so that it
	 * can clear the embryonic state and drop the interface's pending
	 * thread count.
	 */
	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3505 
/*
 * Continuation body of the opportunistic-polling input thread: drains the
 * input queue, samples inbound packet/byte rates, and switches the driver
 * between interrupt (POLL_OFF) and polling (POLL_ON) input models based on
 * EWMA watermarks.  Never returns; it either terminates the thread or
 * blocks with itself as the continuation.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* Exit immediately if interrupted or asked to terminate */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	/* Loop entered and exited with dlth_lock held */
	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/*
		 * First pass after thread creation: skip the sampling and
		 * mode-transition logic; just sync stats and unblock the
		 * attach path waiting on the pending thread count.
		 */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* Clamp the poll interval to the supported minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* Keep accumulating until the sampling hold time elapses */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* Don't flap modes faster than the mode hold time */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/*
			 * Turn polling off when both packet and byte averages
			 * fall below the low watermarks; turn it on when the
			 * packet average plus either byte or wakeup averages
			 * exceed the high watermarks.
			 */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* Downcall to the driver with the new input model */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* Park the thread once no work-related flags remain */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* Sleep until the next dlil_input_wakeup() */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3791 
3792 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3793 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3794 {
3795 	if (p != NULL) {
3796 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3797 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3798 			return EINVAL;
3799 		}
3800 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3801 		    p->packets_lowat >= p->packets_hiwat) {
3802 			return EINVAL;
3803 		}
3804 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3805 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3806 			return EINVAL;
3807 		}
3808 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3809 		    p->bytes_lowat >= p->bytes_hiwat) {
3810 			return EINVAL;
3811 		}
3812 		if (p->interval_time != 0 &&
3813 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3814 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3815 		}
3816 	}
3817 	return 0;
3818 }
3819 
/*
 * Recompute an interface's rx-poll tuning parameters.
 *
 * If the link rate is unknown (0) and no explicit parameters were given,
 * polling is effectively disabled by zeroing the low watermarks and
 * maxing out the high ones.  Otherwise, watermarks are looked up in
 * rxpoll_tbl[] by link speed, with each value individually overridable
 * by a non-zero field in 'p'.  Caller must hold the input thread lock
 * (see dlil_rxpoll_set_params).
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* Find the highest table entry not exceeding the link rate */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/* note: a non-zero if_rxpoll_max sysctl overrides the caller */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
		/* note: a non-default interval sysctl overrides the caller */
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* Convert nanosecond hold times into timespec form for comparisons */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3889 
3890 /*
3891  * Must be called on an attached ifnet (caller is expected to check.)
3892  * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3893  */
3894 errno_t
dlil_rxpoll_set_params(struct ifnet * ifp,struct ifnet_poll_params * p,boolean_t locked)3895 dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
3896     boolean_t locked)
3897 {
3898 	errno_t err;
3899 	struct dlil_threading_info *inp;
3900 
3901 	VERIFY(ifp != NULL);
3902 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3903 		return ENXIO;
3904 	}
3905 	err = dlil_rxpoll_validate_params(p);
3906 	if (err != 0) {
3907 		return err;
3908 	}
3909 
3910 	if (!locked) {
3911 		lck_mtx_lock(&inp->dlth_lock);
3912 	}
3913 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3914 	/*
3915 	 * Normally, we'd reset the parameters to the auto-tuned values
3916 	 * if the the input thread detects a change in link rate.  If the
3917 	 * driver provides its own parameters right after a link rate
3918 	 * changes, but before the input thread gets to run, we want to
3919 	 * make sure to keep the driver's values.  Clearing if_poll_update
3920 	 * will achieve that.
3921 	 */
3922 	if (p != NULL && !locked && ifp->if_poll_update != 0) {
3923 		ifp->if_poll_update = 0;
3924 	}
3925 	dlil_rxpoll_update_params(ifp, p);
3926 	if (!locked) {
3927 		lck_mtx_unlock(&inp->dlth_lock);
3928 	}
3929 	return 0;
3930 }
3931 
3932 /*
3933  * Must be called on an attached ifnet (caller is expected to check.)
3934  */
3935 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3936 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3937 {
3938 	struct dlil_threading_info *inp;
3939 
3940 	VERIFY(ifp != NULL && p != NULL);
3941 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3942 		return ENXIO;
3943 	}
3944 
3945 	bzero(p, sizeof(*p));
3946 
3947 	lck_mtx_lock(&inp->dlth_lock);
3948 	p->packets_limit = ifp->if_rxpoll_plim;
3949 	p->packets_lowat = ifp->if_rxpoll_plowat;
3950 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3951 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3952 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3953 	p->interval_time = ifp->if_rxpoll_ival;
3954 	lck_mtx_unlock(&inp->dlth_lock);
3955 
3956 	return 0;
3957 }
3958 
/*
 * Basic input KPI: enqueue a packet chain without a caller-supplied tail
 * or statistics (both are derived by ifnet_input_common).
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3965 
/*
 * Extended input KPI: caller supplies the chain tail and the statistics,
 * both of which are sanity-checked by ifnet_input_common (ext == TRUE).
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3972 
/*
 * Polling input KPI: like the extended variant, but flags the chain as
 * having arrived via driver polling; m_head may be NULL (empty poll).
 */
errno_t
ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s,
	           (m_head != NULL), TRUE);
}
3980 
3981 static errno_t
ifnet_input_common(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t ext,boolean_t poll)3982 ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
3983     const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
3984 {
3985 	dlil_input_func input_func;
3986 	struct ifnet_stat_increment_param _s;
3987 	u_int32_t m_cnt = 0, m_size = 0;
3988 	struct mbuf *last;
3989 	errno_t err = 0;
3990 
3991 	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
3992 		if (m_head != NULL) {
3993 			mbuf_freem_list(m_head);
3994 		}
3995 		return EINVAL;
3996 	}
3997 
3998 	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
3999 	VERIFY(m_tail == NULL || ext);
4000 	VERIFY(s != NULL || !ext);
4001 
4002 	/*
4003 	 * Drop the packet(s) if the parameters are invalid, or if the
4004 	 * interface is no longer attached; else hold an IO refcnt to
4005 	 * prevent it from being detached (will be released below.)
4006 	 */
4007 	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
4008 		if (m_head != NULL) {
4009 			mbuf_freem_list(m_head);
4010 		}
4011 		return EINVAL;
4012 	}
4013 
4014 	input_func = ifp->if_input_dlil;
4015 	VERIFY(input_func != NULL);
4016 
4017 	if (m_tail == NULL) {
4018 		last = m_head;
4019 		while (m_head != NULL) {
4020 #if IFNET_INPUT_SANITY_CHK
4021 			if (__improbable(dlil_input_sanity_check != 0)) {
4022 				DLIL_INPUT_CHECK(last, ifp);
4023 			}
4024 #endif /* IFNET_INPUT_SANITY_CHK */
4025 			m_cnt++;
4026 			m_size += m_length(last);
4027 			if (mbuf_nextpkt(last) == NULL) {
4028 				break;
4029 			}
4030 			last = mbuf_nextpkt(last);
4031 		}
4032 		m_tail = last;
4033 	} else {
4034 #if IFNET_INPUT_SANITY_CHK
4035 		if (__improbable(dlil_input_sanity_check != 0)) {
4036 			last = m_head;
4037 			while (1) {
4038 				DLIL_INPUT_CHECK(last, ifp);
4039 				m_cnt++;
4040 				m_size += m_length(last);
4041 				if (mbuf_nextpkt(last) == NULL) {
4042 					break;
4043 				}
4044 				last = mbuf_nextpkt(last);
4045 			}
4046 		} else {
4047 			m_cnt = s->packets_in;
4048 			m_size = s->bytes_in;
4049 			last = m_tail;
4050 		}
4051 #else
4052 		m_cnt = s->packets_in;
4053 		m_size = s->bytes_in;
4054 		last = m_tail;
4055 #endif /* IFNET_INPUT_SANITY_CHK */
4056 	}
4057 
4058 	if (last != m_tail) {
4059 		panic_plain("%s: invalid input packet chain for %s, "
4060 		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
4061 		    m_tail, last);
4062 	}
4063 
4064 	/*
4065 	 * Assert packet count only for the extended variant, for backwards
4066 	 * compatibility, since this came directly from the device driver.
4067 	 * Relax this assertion for input bytes, as the driver may have
4068 	 * included the link-layer headers in the computation; hence
4069 	 * m_size is just an approximation.
4070 	 */
4071 	if (ext && s->packets_in != m_cnt) {
4072 		panic_plain("%s: input packet count mismatch for %s, "
4073 		    "%d instead of %d\n", __func__, if_name(ifp),
4074 		    s->packets_in, m_cnt);
4075 	}
4076 
4077 	if (s == NULL) {
4078 		bzero(&_s, sizeof(_s));
4079 		s = &_s;
4080 	} else {
4081 		_s = *s;
4082 	}
4083 	_s.packets_in = m_cnt;
4084 	_s.bytes_in = m_size;
4085 
4086 	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());
4087 
4088 	if (ifp != lo_ifp) {
4089 		/* Release the IO refcnt */
4090 		ifnet_datamov_end(ifp);
4091 	}
4092 
4093 	return err;
4094 }
4095 
4096 #if SKYWALK
/*
 * Atomically install 'fn' as the interface's input function, but only if
 * the current handler is still the default dlil_input_handler; returns
 * EBUSY if some other handler is already installed.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4104 
/*
 * Restore the default dlil_input_handler, retrying the compare-exchange
 * until it succeeds against whatever value is currently installed
 * (tolerates a concurrent writer racing the swap).
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
		;
	}
}
4114 
/*
 * Atomically install 'fn' as the interface's output function, but only if
 * the current handler is still the default dlil_output_handler; returns
 * EBUSY if some other handler is already installed.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4122 
/*
 * Restore the default dlil_output_handler, retrying the compare-exchange
 * until it succeeds against whatever value is currently installed
 * (tolerates a concurrent writer racing the swap).
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
		;
	}
}
4132 #endif /* SKYWALK */
4133 
/*
 * Default output handler: pass the packet straight to the interface's
 * if_output routine.
 */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
4139 
/*
 * Default input handler: dispatch the chain to the interface's input
 * thread strategy (async/sync), falling back to the main input thread
 * when the interface has no dedicated one.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	/* Threads marked for synchronous RX bypass the input thread */
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
4160 
4161 /*
4162  * Detect whether a queue contains a burst that needs to be trimmed.
4163  */
4164 #define MBUF_QUEUE_IS_OVERCOMMITTED(q)                                                                  \
4165 	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) &&           \
4166 	                        qtype(q) == QP_MBUF)
4167 
4168 #define MAX_KNOWN_MBUF_CLASS 8
4169 
/*
 * Trim an overcommitted input queue down to if_rcvq_trim_pct percent of
 * its limit, moving the oldest packets onto 'freeq' (caller frees them
 * after dropping the input thread lock) and adjusting 'stat_delta' so
 * the ifnet statistics reflect the drops.  Returns the number of packets
 * dropped.  Caller must hold the lock protecting 'input_queue'.
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;    /* Length in packets. */
	uint64_t overcommitted_qsize;   /* Size in bytes. */
	uint32_t target_qlen;           /* The desired queue length after trimming. */
	uint32_t pkts_to_drop = 0;      /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;      /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
4266 
/*
 * Asynchronous input strategy: enqueue the chain onto the input thread's
 * receive queue (trimming bursts if overcommitted), update the stats
 * increments, and wake the thread.  Packets are processed later in the
 * input thread's context; always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;     /* trimmed packets; freed after unlocking */
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		/* Trim a burst that overflows the queue; freed after unlock */
		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4410 
/*
 * Synchronous input path for an interface with a dedicated input thread.
 * The chain [m_head .. m_tail] is appended to the thread's receive queue
 * under dlth_lock, the queue is immediately drained, and the packets are
 * processed in the *caller's* context via dlil_input_packet_list_extended()
 * -- no input-thread wakeup is issued here (contrast with the async path).
 *
 * `s' carries the caller-supplied packet/byte counts for the chain;
 * `poll' indicates the packets came from RX polling; `tp' is unused.
 * Packets beyond the queue's burst limit are trimmed and freed (after
 * dropping dlth_lock) rather than reported as an error; always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	/* local copy of stats; may be scaled down if the queue is trimmed */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/* the main input thread must use the async path instead */
	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/*
	 * If the enqueue pushed the queue past its burst limit, trim the
	 * excess into `freeq' (freed after the lock is dropped) and adjust
	 * `s_adj' so the stats below reflect only the retained packets.
	 */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the caller's stat increments match the actual
	 * chain; the chain was counted before any trimming, so this is
	 * unaffected by the trim above.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* drain everything queued (possibly more than we just added) */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4520 
4521 #if SKYWALK
/*
 * Atomically install `fn' as the interface's output handler, but only if
 * the current handler is still the saved original (if_save_output).
 * Returns 0 on success, or EBUSY if some other handler is already
 * installed (i.e. the compare-and-swap lost).  ptrauth_nop_cast strips
 * pointer-authentication typing so the raw pointers can be compared.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	           ptrauth_nop_cast(void *, ifp->if_save_output),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4529 
/*
 * Unconditionally restore the interface's output handler to the saved
 * original (if_save_output).  The expected value is re-read from
 * ifp->if_output on every iteration, so the loop retries until the swap
 * from whatever handler is currently installed succeeds.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
		;
	}
}
4539 
/*
 * Atomically install `fn' as the interface's start routine, but only if
 * the current routine is still the saved original (if_save_start).
 * Returns 0 on success, or EBUSY if another handler already replaced it.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	           ptrauth_nop_cast(void *, ifp->if_save_start),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4547 
/*
 * Unconditionally restore the interface's start routine to the saved
 * original (if_save_start); retries the compare-and-swap against the
 * currently observed value until it succeeds.
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
		;
	}
}
4557 #endif /* SKYWALK */
4558 
/*
 * Common body for ifnet_start()/ifnet_start_ignore_delay(): bump the
 * start-request counter and, when appropriate, wake the interface's
 * dedicated starter thread.  `resetfc' clears the flow-controlled flag
 * (and forces the wakeup check); `ignore_delay' disables the delayed
 * start machinery for this request (IFSF_NO_DELAY).  No-op unless the
 * interface uses the new TX model (IFEF_TXSTART).
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (ignore_delay) {
		ifp->if_start_flags |= IFSF_NO_DELAY;
	}
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/*
	 * Wake the starter only if it's idle and either delayed start
	 * doesn't apply (resetfc, no ENQUEUE_MULTI, queue already past
	 * the delay threshold) or no delay is currently in effect.
	 */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4591 
/*
 * Kick the interface's starter thread (no flow-control reset, delayed
 * start honored).  See ifnet_start_common() for the details.
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, FALSE);
}
4597 
/*
 * Kick the interface's starter thread, bypassing the delayed-start
 * machinery for this request (sets IFSF_NO_DELAY).
 */
void
ifnet_start_ignore_delay(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, TRUE);
}
4603 
/*
 * Entry point of the per-interface starter thread (created elsewhere via
 * kernel_thread_start; `v' is the ifnet).  Names the thread, optionally
 * binds it to the main input thread's affinity tag (lo0 only), moves
 * into the "embryonic" state, and then blocks with
 * ifnet_start_thread_cont as the continuation, which runs the actual
 * service loop.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4669 
/*
 * Continuation body of the starter thread: each time the thread is woken
 * (or its wait deadline fires) it re-enters here.  It services start
 * requests by repeatedly calling the driver's if_start routine until no
 * new request arrived during the last pass, then re-arms its wait --
 * either forever, on the TBR pacing cycle, or on the delayed-start
 * timeout -- and blocks on itself again.  On THREAD_INTERRUPTED or
 * IFSF_TERMINATING it tears itself down and terminates.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/*
	 * First wakeup after creation: leave the embryonic state and
	 * let whoever is waiting on the pending-thread count proceed,
	 * then go straight to re-arming the wait.
	 */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/*
		 * Delayed start: with IFEF_ENQUEUE_MULTI/IFEF_DELAY_START
		 * set and only a short queue, defer the dequeue so more
		 * packets can batch up (re-armed via timeout below).
		 */
		if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
		    !IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_flags &= ~IFSF_NO_DELAY;
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts = NULL;

		/* TBR pacing: wake again after one start cycle if work remains */
		if (ts == NULL) {
			ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
			    &ifp->if_start_cycle : NULL);
		}

		/* delayed start in effect: wake after the configured timeout */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4815 
4816 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4817 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4818 {
4819 	if (ts == NULL) {
4820 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4821 	} else {
4822 		*(&ifp->if_start_cycle) = *ts;
4823 	}
4824 
4825 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4826 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4827 		    if_name(ifp), ts->tv_nsec);
4828 	}
4829 }
4830 
/*
 * Record a poll request and wake the poller thread if it is idle.
 * Caller must hold if_poll_lock.
 */
static inline void
ifnet_poll_wakeup(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);

	ifp->if_poll_req++;
	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
	    ifp->if_poll_thread != THREAD_NULL) {
		wakeup_one((caddr_t)&ifp->if_poll_thread);
	}
}
4842 
/*
 * Request a poll pass from the interface's RX poller thread.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4853 
/*
 * Entry point of the per-interface RX poller thread (`v' is the ifnet;
 * only created for IFEF_RXPOLL interfaces).  Names the thread, enters
 * the embryonic state, issues a self-wakeup to leave it, and blocks with
 * ifnet_poll_thread_cont as the continuation that runs the actual poll
 * loop.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4882 
/*
 * Continuation body of the RX poller thread: on each wakeup (or poll
 * cycle deadline) it repeatedly calls the driver's if_input_poll routine
 * to pull up to m_lim packets and feeds them into ifnet_input_common(),
 * until no new poll request arrived during the last pass.  It then
 * re-arms its wait -- on if_poll_cycle if one is configured, otherwise
 * indefinitely until ifnet_poll() -- and blocks on itself again.  On
 * THREAD_INTERRUPTED or IF_POLLF_TERMINATING it tears itself down and
 * terminates.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/*
	 * First wakeup after creation: leave the embryonic state, let
	 * whoever is waiting on the pending-thread count proceed, then
	 * go straight to re-arming the wait.
	 */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		u_int16_t req = ifp->if_poll_req;

		/* per-pass packet budget: explicit limit, or derived default */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll pass; still notify the input layer */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
5049 
5050 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)5051 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
5052 {
5053 	if (ts == NULL) {
5054 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
5055 	} else {
5056 		*(&ifp->if_poll_cycle) = *ts;
5057 	}
5058 
5059 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
5060 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
5061 		    if_name(ifp), ts->tv_nsec);
5062 	}
5063 }
5064 
5065 void
ifnet_purge(struct ifnet * ifp)5066 ifnet_purge(struct ifnet *ifp)
5067 {
5068 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
5069 		if_qflush_snd(ifp, false);
5070 	}
5071 }
5072 
/*
 * Propagate a classq event `ev' to the send queue.  If a token-bucket
 * regulator is active, re-apply its current profile first (so the TBR
 * picks up e.g. a changed link rate), then let the scheduler update.
 * Caller must hold the ifclassq lock; no-op until the queue is ready.
 */
void
ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);

	if (!(IFCQ_IS_READY(ifq))) {
		return;
	}

	if (IFCQ_TBR_IS_ENABLED(ifq)) {
		/* rebuild the profile from the TBR's current settings */
		struct tb_profile tb = {
			.rate = ifq->ifcq_tbr.tbr_rate_raw,
			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
		};
		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
	}

	ifclassq_update(ifq, ev);
}
5092 
5093 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)5094 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
5095 {
5096 	switch (ev) {
5097 	case CLASSQ_EV_LINK_BANDWIDTH:
5098 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
5099 			ifp->if_poll_update++;
5100 		}
5101 		break;
5102 
5103 	default:
5104 		break;
5105 	}
5106 }
5107 
/*
 * Select the output scheduling model for the interface and rebuild the
 * packet scheduler accordingly.  On setup failure the previous model is
 * restored.  Returns EINVAL for a bad ifp/model, ENXIO if the interface
 * does not use the new TX model, else the ifclassq_pktsched_setup()
 * result.
 */
errno_t
ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
{
	struct ifclassq *ifq;
	u_int32_t omodel;
	errno_t err;

	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* remember the old model so we can roll back on failure */
	omodel = ifp->if_output_sched_model;
	ifp->if_output_sched_model = model;
	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
		ifp->if_output_sched_model = omodel;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
5132 
5133 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5134 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5135 {
5136 	if (ifp == NULL) {
5137 		return EINVAL;
5138 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5139 		return ENXIO;
5140 	}
5141 
5142 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
5143 
5144 	return 0;
5145 }
5146 
5147 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5148 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5149 {
5150 	if (ifp == NULL || maxqlen == NULL) {
5151 		return EINVAL;
5152 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5153 		return ENXIO;
5154 	}
5155 
5156 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
5157 
5158 	return 0;
5159 }
5160 
5161 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)5162 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
5163 {
5164 	errno_t err;
5165 
5166 	if (ifp == NULL || pkts == NULL) {
5167 		err = EINVAL;
5168 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5169 		err = ENXIO;
5170 	} else {
5171 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
5172 		    IF_CLASSQ_ALL_GRPS, pkts, NULL);
5173 	}
5174 
5175 	return err;
5176 }
5177 
5178 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)5179 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
5180     u_int32_t *pkts, u_int32_t *bytes)
5181 {
5182 	errno_t err;
5183 
5184 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
5185 	    (pkts == NULL && bytes == NULL)) {
5186 		err = EINVAL;
5187 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5188 		err = ENXIO;
5189 	} else {
5190 		err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
5191 		    pkts, bytes);
5192 	}
5193 
5194 	return err;
5195 }
5196 
5197 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5198 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5199 {
5200 	struct dlil_threading_info *inp;
5201 
5202 	if (ifp == NULL) {
5203 		return EINVAL;
5204 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5205 		return ENXIO;
5206 	}
5207 
5208 	if (maxqlen == 0) {
5209 		maxqlen = if_rcvq_maxlen;
5210 	} else if (maxqlen < IF_RCVQ_MINLEN) {
5211 		maxqlen = IF_RCVQ_MINLEN;
5212 	}
5213 
5214 	inp = ifp->if_inp;
5215 	lck_mtx_lock(&inp->dlth_lock);
5216 	qlimit(&inp->dlth_pkts) = maxqlen;
5217 	lck_mtx_unlock(&inp->dlth_lock);
5218 
5219 	return 0;
5220 }
5221 
5222 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5223 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5224 {
5225 	struct dlil_threading_info *inp;
5226 
5227 	if (ifp == NULL || maxqlen == NULL) {
5228 		return EINVAL;
5229 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5230 		return ENXIO;
5231 	}
5232 
5233 	inp = ifp->if_inp;
5234 	lck_mtx_lock(&inp->dlth_lock);
5235 	*maxqlen = qlimit(&inp->dlth_pkts);
5236 	lck_mtx_unlock(&inp->dlth_lock);
5237 	return 0;
5238 }
5239 
5240 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)5241 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
5242     uint16_t delay_timeout)
5243 {
5244 	if (delay_qlen > 0 && delay_timeout > 0) {
5245 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
5246 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
5247 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
5248 		/* convert timeout to nanoseconds */
5249 		ifp->if_start_delay_timeout *= 1000;
5250 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
5251 		    ifp->if_xname, (uint32_t)delay_qlen,
5252 		    (uint32_t)delay_timeout);
5253 	} else {
5254 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
5255 	}
5256 }
5257 
5258 /*
5259  * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
5260  * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
5261  * buf holds the full header.
5262  */
5263 static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t * buf,uint8_t ip_ver)5264 ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
5265 {
5266 	struct ip *ip;
5267 	struct ip6_hdr *ip6;
5268 	uint8_t lbuf[64] __attribute__((aligned(8)));
5269 	uint8_t *p = buf;
5270 
5271 	if (ip_ver == IPVERSION) {
5272 		uint8_t old_tos;
5273 		uint32_t sum;
5274 
5275 		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
5276 			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
5277 			bcopy(buf, lbuf, sizeof(struct ip));
5278 			p = lbuf;
5279 		}
5280 		ip = (struct ip *)(void *)p;
5281 		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
5282 			return;
5283 		}
5284 
5285 		DTRACE_IP1(clear__v4, struct ip *, ip);
5286 		old_tos = ip->ip_tos;
5287 		ip->ip_tos &= IPTOS_ECN_MASK;
5288 		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
5289 		sum = (sum >> 16) + (sum & 0xffff);
5290 		ip->ip_sum = (uint16_t)(sum & 0xffff);
5291 
5292 		if (__improbable(p == lbuf)) {
5293 			bcopy(lbuf, buf, sizeof(struct ip));
5294 		}
5295 	} else {
5296 		uint32_t flow;
5297 		ASSERT(ip_ver == IPV6_VERSION);
5298 
5299 		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
5300 			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
5301 			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
5302 			p = lbuf;
5303 		}
5304 		ip6 = (struct ip6_hdr *)(void *)p;
5305 		flow = ntohl(ip6->ip6_flow);
5306 		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
5307 			return;
5308 		}
5309 
5310 		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
5311 		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);
5312 
5313 		if (__improbable(p == lbuf)) {
5314 			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
5315 		}
5316 	}
5317 }
5318 
/*
 * ifnet_enqueue_ifclassq() -- enqueue one packet (mbuf or native Skywalk
 * packet) on a transmit classq and, when appropriate, poke the driver's
 * start routine.  The packet object is consumed by the classq.
 *
 *   ifp:   transmit interface; IFEF_TXSTART must be set.
 *   ifcq:  optional classq; if NULL, ifp->if_snd is used.
 *   p:     packet to enqueue (QP_MBUF or QP_PACKET).
 *   flush: if TRUE, kick ifnet_start() after a successful enqueue.
 *   pdrop: out parameter, set by the classq to report a drop.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	/* foreground/realtime send timestamps in the nexus advisory region */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	/* non-NULL: IP header of a multicast frame that needs its DSCP cleared */
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* ensure at least the Ethernet header is contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP: no DSCP to clear */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* m_pullup may relocate the data */
				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			/* headers must reside in the first buflet; no pullup here */
			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP: no DSCP to clear */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* apply the Wi-Fi multicast DSCP workaround set up above */
	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the current delay window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and stop delaying */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: evaluate bulk-transfer heuristic */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open a fresh delay window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5629 
5630 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5631 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5632     classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5633     boolean_t flush, boolean_t *pdrop)
5634 {
5635 	int error;
5636 
5637 	/* enqueue the packet (caller consumes object) */
5638 	error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5639 	    cnt, bytes, pdrop);
5640 
5641 	/*
5642 	 * Tell the driver to start dequeueing; do this even when the queue
5643 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5644 	 * be dequeueing from other unsuspended queues.
5645 	 */
5646 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5647 		ifnet_start(ifp);
5648 	}
5649 	return error;
5650 }
5651 
5652 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5653 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5654 {
5655 	struct ifnet *ifp = handle;
5656 	boolean_t pdrop;        /* dummy */
5657 	uint32_t i;
5658 
5659 	ASSERT(n_pkts >= 1);
5660 	for (i = 0; i < n_pkts - 1; i++) {
5661 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5662 		    FALSE, &pdrop);
5663 	}
5664 	/* flush with the last packet */
5665 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5666 	    TRUE, &pdrop);
5667 
5668 	return 0;
5669 }
5670 
5671 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5672 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5673     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5674 {
5675 	if (ifp->if_output_netem != NULL) {
5676 		bool drop;
5677 		errno_t error;
5678 		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5679 		*pdrop = drop ? TRUE : FALSE;
5680 		return error;
5681 	} else {
5682 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5683 	}
5684 }
5685 
5686 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5687 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5688 {
5689 	uint32_t bytes = m_pktlen(m);
5690 	struct mbuf *tail = m;
5691 	uint32_t cnt = 1;
5692 	boolean_t pdrop;
5693 
5694 	while (tail->m_nextpkt) {
5695 		VERIFY(tail->m_flags & M_PKTHDR);
5696 		tail = tail->m_nextpkt;
5697 		cnt++;
5698 		bytes += m_pktlen(tail);
5699 	}
5700 
5701 	return ifnet_enqueue_mbuf_chain(ifp, m, tail, cnt, bytes, TRUE, &pdrop);
5702 }
5703 
5704 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5705 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5706     boolean_t *pdrop)
5707 {
5708 	classq_pkt_t pkt;
5709 
5710 	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5711 	    m->m_nextpkt != NULL) {
5712 		if (m != NULL) {
5713 			m_freem_list(m);
5714 			*pdrop = TRUE;
5715 		}
5716 		return EINVAL;
5717 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5718 	    !IF_FULLY_ATTACHED(ifp)) {
5719 		/* flag tested without lock for performance */
5720 		m_freem(m);
5721 		*pdrop = TRUE;
5722 		return ENXIO;
5723 	} else if (!(ifp->if_flags & IFF_UP)) {
5724 		m_freem(m);
5725 		*pdrop = TRUE;
5726 		return ENETDOWN;
5727 	}
5728 
5729 	CLASSQ_PKT_INIT_MBUF(&pkt, m);
5730 	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5731 }
5732 
5733 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5734 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5735     struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5736     boolean_t *pdrop)
5737 {
5738 	classq_pkt_t head, tail;
5739 
5740 	ASSERT(m_head != NULL);
5741 	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5742 	ASSERT(m_tail != NULL);
5743 	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5744 	ASSERT(ifp != NULL);
5745 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5746 
5747 	if (!IF_FULLY_ATTACHED(ifp)) {
5748 		/* flag tested without lock for performance */
5749 		m_freem_list(m_head);
5750 		*pdrop = TRUE;
5751 		return ENXIO;
5752 	} else if (!(ifp->if_flags & IFF_UP)) {
5753 		m_freem_list(m_head);
5754 		*pdrop = TRUE;
5755 		return ENETDOWN;
5756 	}
5757 
5758 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
5759 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5760 	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5761 	           flush, pdrop);
5762 }
5763 
5764 #if SKYWALK
5765 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5766 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5767     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5768 {
5769 	classq_pkt_t pkt;
5770 
5771 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5772 
5773 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5774 		if (kpkt != NULL) {
5775 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5776 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5777 			*pdrop = TRUE;
5778 		}
5779 		return EINVAL;
5780 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5781 	    !IF_FULLY_ATTACHED(ifp))) {
5782 		/* flag tested without lock for performance */
5783 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5784 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5785 		*pdrop = TRUE;
5786 		return ENXIO;
5787 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5788 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5789 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5790 		*pdrop = TRUE;
5791 		return ENETDOWN;
5792 	}
5793 
5794 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5795 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5796 }
5797 
/*
 * Enqueue a single Skywalk packet on the interface's default send queue.
 * Thin wrapper over ifnet_enqueue_pkt_common() with a NULL classq.
 */
errno_t
ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
    boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
}
5804 
/*
 * Enqueue a single Skywalk packet on a caller-specified classq.
 * Thin wrapper over ifnet_enqueue_pkt_common().
 */
errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
}
5811 
5812 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5813 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5814     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5815     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5816 {
5817 	classq_pkt_t head, tail;
5818 
5819 	ASSERT(k_head != NULL);
5820 	ASSERT(k_tail != NULL);
5821 	ASSERT(ifp != NULL);
5822 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5823 
5824 	if (!IF_FULLY_ATTACHED(ifp)) {
5825 		/* flag tested without lock for performance */
5826 		pp_free_packet_chain(k_head, NULL);
5827 		*pdrop = TRUE;
5828 		return ENXIO;
5829 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5830 		pp_free_packet_chain(k_head, NULL);
5831 		*pdrop = TRUE;
5832 		return ENETDOWN;
5833 	}
5834 
5835 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5836 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5837 	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5838 	           flush, pdrop);
5839 }
5840 
/*
 * Enqueue a Skywalk packet chain on the interface's default send queue.
 * Thin wrapper over ifnet_enqueue_pkt_chain_common() with a NULL classq.
 */
errno_t
ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
    struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
    boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5849 
/*
 * Enqueue a Skywalk packet chain on a caller-specified classq.
 * Thin wrapper over ifnet_enqueue_pkt_chain_common().
 */
errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
    uint32_t bytes, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5858 #endif /* SKYWALK */
5859 
5860 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5861 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5862 {
5863 	errno_t rc;
5864 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5865 
5866 	if (ifp == NULL || mp == NULL) {
5867 		return EINVAL;
5868 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5869 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5870 		return ENXIO;
5871 	}
5872 	if (!ifnet_is_attached(ifp, 1)) {
5873 		return ENXIO;
5874 	}
5875 
5876 #if SKYWALK
5877 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5878 #endif /* SKYWALK */
5879 	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5880 	    &pkt, NULL, NULL, NULL, 0);
5881 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5882 	ifnet_decr_iorefcnt(ifp);
5883 	*mp = pkt.cp_mbuf;
5884 	return rc;
5885 }
5886 
5887 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5888 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5889     struct mbuf **mp)
5890 {
5891 	errno_t rc;
5892 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5893 
5894 	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5895 		return EINVAL;
5896 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5897 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5898 		return ENXIO;
5899 	}
5900 	if (!ifnet_is_attached(ifp, 1)) {
5901 		return ENXIO;
5902 	}
5903 
5904 #if SKYWALK
5905 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5906 #endif /* SKYWALK */
5907 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5908 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5909 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5910 	ifnet_decr_iorefcnt(ifp);
5911 	*mp = pkt.cp_mbuf;
5912 	return rc;
5913 }
5914 
5915 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5916 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5917     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5918 {
5919 	errno_t rc;
5920 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5921 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5922 
5923 	if (ifp == NULL || head == NULL || pkt_limit < 1) {
5924 		return EINVAL;
5925 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5926 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5927 		return ENXIO;
5928 	}
5929 	if (!ifnet_is_attached(ifp, 1)) {
5930 		return ENXIO;
5931 	}
5932 
5933 #if SKYWALK
5934 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5935 #endif /* SKYWALK */
5936 	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5937 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5938 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5939 	ifnet_decr_iorefcnt(ifp);
5940 	*head = pkt_head.cp_mbuf;
5941 	if (tail != NULL) {
5942 		*tail = pkt_tail.cp_mbuf;
5943 	}
5944 	return rc;
5945 }
5946 
5947 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5948 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5949     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5950 {
5951 	errno_t rc;
5952 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5953 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5954 
5955 	if (ifp == NULL || head == NULL || byte_limit < 1) {
5956 		return EINVAL;
5957 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5958 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5959 		return ENXIO;
5960 	}
5961 	if (!ifnet_is_attached(ifp, 1)) {
5962 		return ENXIO;
5963 	}
5964 
5965 #if SKYWALK
5966 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5967 #endif /* SKYWALK */
5968 	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5969 	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5970 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5971 	ifnet_decr_iorefcnt(ifp);
5972 	*head = pkt_head.cp_mbuf;
5973 	if (tail != NULL) {
5974 		*tail = pkt_tail.cp_mbuf;
5975 	}
5976 	return rc;
5977 }
5978 
5979 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5980 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5981     u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5982     u_int32_t *len)
5983 {
5984 	errno_t rc;
5985 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5986 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5987 
5988 	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5989 	    !MBUF_VALID_SC(sc)) {
5990 		return EINVAL;
5991 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5992 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5993 		return ENXIO;
5994 	}
5995 	if (!ifnet_is_attached(ifp, 1)) {
5996 		return ENXIO;
5997 	}
5998 
5999 #if SKYWALK
6000 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
6001 #endif /* SKYWALK */
6002 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
6003 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
6004 	    cnt, len, 0);
6005 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
6006 	ifnet_decr_iorefcnt(ifp);
6007 	*head = pkt_head.cp_mbuf;
6008 	if (tail != NULL) {
6009 		*tail = pkt_tail.cp_mbuf;
6010 	}
6011 	return rc;
6012 }
6013 
6014 #if XNU_TARGET_OS_OSX
6015 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)6016 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
6017     const struct sockaddr *dest, const char *dest_linkaddr,
6018     const char *frame_type, u_int32_t *pre, u_int32_t *post)
6019 {
6020 	if (pre != NULL) {
6021 		*pre = 0;
6022 	}
6023 	if (post != NULL) {
6024 		*post = 0;
6025 	}
6026 
6027 	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
6028 }
6029 #endif /* XNU_TARGET_OS_OSX */
6030 
6031 static boolean_t
packet_has_vlan_tag(struct mbuf * m)6032 packet_has_vlan_tag(struct mbuf * m)
6033 {
6034 	u_int   tag = 0;
6035 
6036 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
6037 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
6038 		if (tag == 0) {
6039 			/* the packet is just priority-tagged, clear the bit */
6040 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
6041 		}
6042 	}
6043 	return tag != 0;
6044 }
6045 
/*
 * Run an inbound packet through the interface filter chain.
 *
 * Returns 0 when the (possibly replaced) packet in *m_p should continue
 * up the stack; a non-zero result means a filter consumed or rejected
 * it and processing must stop.  The filter lock is dropped around each
 * callback, so the list is marked busy first to keep it stable.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	/* side effect: clears CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock; the callback may block or re-enter DLIL */
			lck_mtx_unlock(&ifp->if_flt_lock);

			/* callback may replace the mbuf and/or frame header */
			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
6106 
/*
 * Run an outbound packet through the interface filter chain.
 *
 * Returns 0 when the (possibly replaced) packet in *m_p should continue
 * toward the driver; a non-zero result means a filter consumed or
 * rejected it.  Mirrors dlil_interface_filters_input(), except that the
 * VLAN-tag check runs only after the empty-list fast path.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}
	/* side effect: clears CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock; the callback may block or re-enter DLIL */
			lck_mtx_unlock(&ifp->if_flt_lock);

			/* callback may replace or consume the mbuf via m_p */
			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
6159 
6160 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)6161 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
6162 {
6163 	int error;
6164 
6165 	if (ifproto->proto_kpi == kProtoKPI_v1) {
6166 		/* Version 1 protocols get one packet at a time */
6167 		while (m != NULL) {
6168 			char *  frame_header;
6169 			mbuf_t  next_packet;
6170 
6171 			next_packet = m->m_nextpkt;
6172 			m->m_nextpkt = NULL;
6173 			frame_header = m->m_pkthdr.pkt_hdr;
6174 			m->m_pkthdr.pkt_hdr = NULL;
6175 			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
6176 			    ifproto->protocol_family, m, frame_header);
6177 			if (error != 0 && error != EJUSTRETURN) {
6178 				m_freem(m);
6179 			}
6180 			m = next_packet;
6181 		}
6182 	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
6183 		/* Version 2 protocols support packet lists */
6184 		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
6185 		    ifproto->protocol_family, m);
6186 		if (error != 0 && error != EJUSTRETURN) {
6187 			m_freem_list(m);
6188 		}
6189 	}
6190 }
6191 
6192 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)6193 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
6194     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
6195 {
6196 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
6197 
6198 	if (s->packets_in != 0) {
6199 		d->packets_in += s->packets_in;
6200 	}
6201 	if (s->bytes_in != 0) {
6202 		d->bytes_in += s->bytes_in;
6203 	}
6204 	if (s->errors_in != 0) {
6205 		d->errors_in += s->errors_in;
6206 	}
6207 
6208 	if (s->packets_out != 0) {
6209 		d->packets_out += s->packets_out;
6210 	}
6211 	if (s->bytes_out != 0) {
6212 		d->bytes_out += s->bytes_out;
6213 	}
6214 	if (s->errors_out != 0) {
6215 		d->errors_out += s->errors_out;
6216 	}
6217 
6218 	if (s->collisions != 0) {
6219 		d->collisions += s->collisions;
6220 	}
6221 	if (s->dropped != 0) {
6222 		d->dropped += s->dropped;
6223 	}
6224 
6225 	if (poll) {
6226 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
6227 	}
6228 }
6229 
/*
 * Flush the DLIL input thread's accumulated statistics into the
 * interface's global counters, zeroing each local field as it is
 * drained.  Returns TRUE when the interface has a non-zero data
 * threshold set (caller decides what to do with that).
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
6289 
6290 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6291 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6292 {
6293 	return dlil_input_packet_list_common(ifp, m, 0,
6294 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6295 }
6296 
6297 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6298 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6299     u_int32_t cnt, ifnet_model_t mode)
6300 {
6301 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6302 }
6303 
/*
 * dlil_input_packet_list_common
 *
 * Core inbound demultiplexer.  Walks the chain of received packets
 * (linked through m_nextpkt), demuxes each packet to a protocol family
 * via the interface's if_demux routine, runs interface filters, applies
 * CLAT46/64 translation when enabled, and batches consecutive packets
 * bound for the same protocol attachment so they can be handed up with
 * a single call to dlil_ifproto_input().
 *
 * ifp_param: the receiving interface, or NULL when the chain may span
 *     multiple interfaces (each packet's rcvif is used instead).
 * cnt, mode, ext: supplied by the extended variant to drive
 *     opportunistic input polling via ifnet_poll().
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;        /* head of the batch for last_ifproto */
	mbuf_t *pkt_next = NULL;        /* tail pointer used to append to the batch */
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * When the caller says polling is on and told us how many packets
	 * are in the chain, arm a countdown so ifnet_poll() is poked every
	 * poll_ival packets in the loop below.
	 */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach this packet from the chain before processing it */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				/* demux consumed the packet */
				goto next;
			}
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			/* log (and hexdump) the beginning of the wake packet */
			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceeding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has
		 * been queried by the driver before the device went to sleep.
		 * NOTE(review): the PF_INET/PF_INET6 exclusion below suggests
		 * IP packets are matched later in the IP input path instead;
		 * confirm against if_ports_used_match_mbuf().
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - (uintptr_t)frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				/* inconsistent header bounds: invalidate the checksum */
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			os_atomic_inc(&ifp->if_imcasts, relaxed);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same attachment as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6636 
6637 static errno_t
if_mcasts_update_common(struct ifnet * ifp,bool sync)6638 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6639 {
6640 	errno_t err;
6641 
6642 	if (sync) {
6643 		err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6644 		if (err == EAFNOSUPPORT) {
6645 			err = 0;
6646 		}
6647 	} else {
6648 		ifnet_ioctl_async(ifp, SIOCADDMULTI);
6649 		err = 0;
6650 	}
6651 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6652 	    "(err=%d)\n", if_name(ifp),
6653 	    (err == 0 ? "successfully restored" : "failed to restore"),
6654 	    ifp->if_updatemcasts, err);
6655 
6656 	/* just return success */
6657 	return 0;
6658 }
6659 
6660 static errno_t
if_mcasts_update_async(struct ifnet * ifp)6661 if_mcasts_update_async(struct ifnet *ifp)
6662 {
6663 	return if_mcasts_update_common(ifp, false);
6664 }
6665 
6666 errno_t
if_mcasts_update(struct ifnet * ifp)6667 if_mcasts_update(struct ifnet *ifp)
6668 {
6669 	return if_mcasts_update_common(ifp, true);
6670 }
6671 
6672 /* If ifp is set, we will increment the generation for the interface */
6673 int
dlil_post_complete_msg(struct ifnet * ifp,struct kev_msg * event)6674 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6675 {
6676 	if (ifp != NULL) {
6677 		ifnet_increment_generation(ifp);
6678 	}
6679 
6680 #if NECP
6681 	necp_update_all_clients();
6682 #endif /* NECP */
6683 
6684 	return kev_post_msg(event);
6685 }
6686 
6687 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6688 dlil_post_sifflags_msg(struct ifnet * ifp)
6689 {
6690 	struct kev_msg ev_msg;
6691 	struct net_event_data ev_data;
6692 
6693 	bzero(&ev_data, sizeof(ev_data));
6694 	bzero(&ev_msg, sizeof(ev_msg));
6695 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6696 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6697 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6698 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6699 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6700 	ev_data.if_family = ifp->if_family;
6701 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6702 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6703 	ev_msg.dv[0].data_ptr = &ev_data;
6704 	ev_msg.dv[1].data_length = 0;
6705 	dlil_post_complete_msg(ifp, &ev_msg);
6706 }
6707 
/*
 * Size of the on-stack scratch array used to collect protocol
 * attachments; a heap array is allocated when an interface has more
 * attachments than this.
 */
#define TMP_IF_PROTO_ARR_SIZE   10
/*
 * dlil_event_internal
 *
 * Deliver a kernel event to everything attached to the interface:
 * first the interface filters, then every attached protocol's event
 * handler, then the interface's own if_event callback; finally the
 * event is posted system-wide via dlil_post_complete_msg().
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the mutex while calling out to the filter */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* snapshot the attachments, taking a ref on each while locked */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* invoke each protocol's event handler without holding the ifnet lock */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6808 
6809 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6810 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6811 {
6812 	struct kev_msg kev_msg;
6813 	int result = 0;
6814 
6815 	if (ifp == NULL || event == NULL) {
6816 		return EINVAL;
6817 	}
6818 
6819 	bzero(&kev_msg, sizeof(kev_msg));
6820 	kev_msg.vendor_code = event->vendor_code;
6821 	kev_msg.kev_class = event->kev_class;
6822 	kev_msg.kev_subclass = event->kev_subclass;
6823 	kev_msg.event_code = event->event_code;
6824 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6825 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6826 	kev_msg.dv[1].data_length = 0;
6827 
6828 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6829 
6830 	return result;
6831 }
6832 
6833 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6834 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6835 {
6836 	mbuf_t  n = m;
6837 	int chainlen = 0;
6838 
6839 	while (n != NULL) {
6840 		chainlen++;
6841 		n = n->m_next;
6842 	}
6843 	switch (chainlen) {
6844 	case 0:
6845 		break;
6846 	case 1:
6847 		os_atomic_inc(&cls->cls_one, relaxed);
6848 		break;
6849 	case 2:
6850 		os_atomic_inc(&cls->cls_two, relaxed);
6851 		break;
6852 	case 3:
6853 		os_atomic_inc(&cls->cls_three, relaxed);
6854 		break;
6855 	case 4:
6856 		os_atomic_inc(&cls->cls_four, relaxed);
6857 		break;
6858 	case 5:
6859 	default:
6860 		os_atomic_inc(&cls->cls_five_or_more, relaxed);
6861 		break;
6862 	}
6863 }
6864 
#if CONFIG_DTRACE
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t  m)
{
	/* Fire the DTrace send probe for IPv4/IPv6 output packets */
	if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	} else if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	}
}
#endif /* CONFIG_DTRACE */
6883 
6884 /*
6885  * dlil_output
6886  *
6887  * Caller should have a lock on the protocol domain if the protocol
6888  * doesn't support finer grained locking. In most cases, the lock
6889  * will be held from the socket layer and won't be released until
6890  * we return back to the socket layer.
6891  *
6892  * This does mean that we must take a protocol lock before we take
6893  * an interface lock if we're going to take both. This makes sense
6894  * because a protocol is likely to interact with an ifp while it
6895  * is under the protocol lock.
6896  *
6897  * An advisory code will be returned if adv is not null. This
6898  * can be used to provide feedback about interface queues to the
6899  * application.
6900  */
6901 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int raw,struct flowadv * adv)6902 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6903     void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
6904 {
6905 	char *frame_type = NULL;
6906 	char *dst_linkaddr = NULL;
6907 	int retval = 0;
6908 	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
6909 	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
6910 	struct if_proto *proto = NULL;
6911 	mbuf_t  m = NULL;
6912 	mbuf_t  send_head = NULL;
6913 	mbuf_t  *send_tail = &send_head;
6914 	int iorefcnt = 0;
6915 	u_int32_t pre = 0, post = 0;
6916 	u_int32_t fpkts = 0, fbytes = 0;
6917 	int32_t flen = 0;
6918 	struct timespec now;
6919 	u_int64_t now_nsec;
6920 	boolean_t did_clat46 = FALSE;
6921 	protocol_family_t old_proto_family = proto_family;
6922 	struct sockaddr_in6 dest6;
6923 	struct rtentry *rt = NULL;
6924 	u_int16_t m_loop_set = 0;
6925 
6926 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6927 
6928 	/*
6929 	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6930 	 * from happening while this operation is in progress
6931 	 */
6932 	if (!ifnet_datamov_begin(ifp)) {
6933 		retval = ENXIO;
6934 		goto cleanup;
6935 	}
6936 	iorefcnt = 1;
6937 
6938 	VERIFY(ifp->if_output_dlil != NULL);
6939 
6940 	/* update the driver's multicast filter, if needed */
6941 	if (ifp->if_updatemcasts > 0) {
6942 		if_mcasts_update_async(ifp);
6943 		ifp->if_updatemcasts = 0;
6944 	}
6945 
6946 	frame_type = frame_type_buffer;
6947 	dst_linkaddr = dst_linkaddr_buffer;
6948 
6949 	if (raw == 0) {
6950 		ifnet_lock_shared(ifp);
6951 		/* callee holds a proto refcnt upon success */
6952 		proto = find_attached_proto(ifp, proto_family);
6953 		if (proto == NULL) {
6954 			ifnet_lock_done(ifp);
6955 			retval = ENXIO;
6956 			goto cleanup;
6957 		}
6958 		ifnet_lock_done(ifp);
6959 	}
6960 
6961 preout_again:
6962 	if (packetlist == NULL) {
6963 		goto cleanup;
6964 	}
6965 
6966 	m = packetlist;
6967 	packetlist = packetlist->m_nextpkt;
6968 	m->m_nextpkt = NULL;
6969 
6970 	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6971 
6972 	/*
6973 	 * Perform address family translation for the first
6974 	 * packet outside the loop in order to perform address
6975 	 * lookup for the translated proto family.
6976 	 */
6977 	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6978 	    (ifp->if_type == IFT_CELLULAR ||
6979 	    dlil_is_clat_needed(proto_family, m))) {
6980 		retval = dlil_clat46(ifp, &proto_family, &m);
6981 		/*
6982 		 * Go to the next packet if translation fails
6983 		 */
6984 		if (retval != 0) {
6985 			m_freem(m);
6986 			m = NULL;
6987 			ip6stat.ip6s_clat464_out_drop++;
6988 			/* Make sure that the proto family is PF_INET */
6989 			ASSERT(proto_family == PF_INET);
6990 			goto preout_again;
6991 		}
6992 		/*
6993 		 * Free the old one and make it point to the IPv6 proto structure.
6994 		 *
6995 		 * Change proto for the first time we have successfully
6996 		 * performed address family translation.
6997 		 */
6998 		if (!did_clat46 && proto_family == PF_INET6) {
6999 			did_clat46 = TRUE;
7000 
7001 			if (proto != NULL) {
7002 				if_proto_free(proto);
7003 			}
7004 			ifnet_lock_shared(ifp);
7005 			/* callee holds a proto refcnt upon success */
7006 			proto = find_attached_proto(ifp, proto_family);
7007 			if (proto == NULL) {
7008 				ifnet_lock_done(ifp);
7009 				retval = ENXIO;
7010 				m_freem(m);
7011 				m = NULL;
7012 				goto cleanup;
7013 			}
7014 			ifnet_lock_done(ifp);
7015 			if (ifp->if_type == IFT_ETHER) {
7016 				/* Update the dest to translated v6 address */
7017 				dest6.sin6_len = sizeof(struct sockaddr_in6);
7018 				dest6.sin6_family = AF_INET6;
7019 				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
7020 				dest = SA(&dest6);
7021 
7022 				/*
7023 				 * Lookup route to the translated destination
7024 				 * Free this route ref during cleanup
7025 				 */
7026 				rt = rtalloc1_scoped(SA(&dest6),
7027 				    0, 0, ifp->if_index);
7028 
7029 				route = rt;
7030 			}
7031 		}
7032 	}
7033 
7034 	/*
7035 	 * This path gets packet chain going to the same destination.
7036 	 * The pre output routine is used to either trigger resolution of
7037 	 * the next hop or retreive the next hop's link layer addressing.
7038 	 * For ex: ether_inet(6)_pre_output routine.
7039 	 *
7040 	 * If the routine returns EJUSTRETURN, it implies that packet has
7041 	 * been queued, and therefore we have to call preout_again for the
7042 	 * following packet in the chain.
7043 	 *
7044 	 * For errors other than EJUSTRETURN, the current packet is freed
7045 	 * and the rest of the chain (pointed by packetlist is freed as
7046 	 * part of clean up.
7047 	 *
7048 	 * Else if there is no error the retrieved information is used for
7049 	 * all the packets in the chain.
7050 	 */
7051 	if (raw == 0) {
7052 		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
7053 		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
7054 		retval = 0;
7055 		if (preoutp != NULL) {
7056 			retval = preoutp(ifp, proto_family, &m, dest, route,
7057 			    frame_type, dst_linkaddr);
7058 
7059 			if (retval != 0) {
7060 				if (retval == EJUSTRETURN) {
7061 					goto preout_again;
7062 				}
7063 				m_freem(m);
7064 				m = NULL;
7065 				goto cleanup;
7066 			}
7067 		}
7068 	}
7069 
7070 	nanouptime(&now);
7071 	net_timernsec(&now, &now_nsec);
7072 
7073 	do {
7074 		/*
7075 		 * pkt_hdr is set here to point to m_data prior to
7076 		 * calling into the framer. This value of pkt_hdr is
7077 		 * used by the netif gso logic to retrieve the ip header
7078 		 * for the TCP packets, offloaded for TSO processing.
7079 		 */
7080 		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
7081 			uint8_t vlan_encap_len = 0;
7082 
7083 			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
7084 				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
7085 			}
7086 			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
7087 		} else {
7088 			m->m_pkthdr.pkt_hdr = mtod(m, void *);
7089 		}
7090 
7091 		/*
7092 		 * Perform address family translation if needed.
7093 		 * For now we only support stateless 4 to 6 translation
7094 		 * on the out path.
7095 		 *
7096 		 * The routine below translates IP header, updates protocol
7097 		 * checksum and also translates ICMP.
7098 		 *
7099 		 * We skip the first packet as it is already translated and
7100 		 * the proto family is set to PF_INET6.
7101 		 */
7102 		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
7103 		    (ifp->if_type == IFT_CELLULAR ||
7104 		    dlil_is_clat_needed(proto_family, m))) {
7105 			retval = dlil_clat46(ifp, &proto_family, &m);
7106 			/* Goto the next packet if the translation fails */
7107 			if (retval != 0) {
7108 				m_freem(m);
7109 				m = NULL;
7110 				ip6stat.ip6s_clat464_out_drop++;
7111 				goto next;
7112 			}
7113 		}
7114 
7115 #if CONFIG_DTRACE
7116 		if (!raw) {
7117 			dlil_output_dtrace(ifp, proto_family, m);
7118 		}
7119 #endif /* CONFIG_DTRACE */
7120 
7121 		if (raw == 0 && ifp->if_framer != NULL) {
7122 			int rcvif_set = 0;
7123 
7124 			/*
7125 			 * If this is a broadcast packet that needs to be
7126 			 * looped back into the system, set the inbound ifp
7127 			 * to that of the outbound ifp.  This will allow
7128 			 * us to determine that it is a legitimate packet
7129 			 * for the system.  Only set the ifp if it's not
7130 			 * already set, just to be safe.
7131 			 */
7132 			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
7133 			    m->m_pkthdr.rcvif == NULL) {
7134 				m->m_pkthdr.rcvif = ifp;
7135 				rcvif_set = 1;
7136 			}
7137 			m_loop_set = m->m_flags & M_LOOP;
7138 			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
7139 			    frame_type, &pre, &post);
7140 			if (retval != 0) {
7141 				if (retval != EJUSTRETURN) {
7142 					m_freem(m);
7143 				}
7144 				goto next;
7145 			}
7146 
7147 			/*
7148 			 * For partial checksum offload, adjust the start
7149 			 * and stuff offsets based on the prepended header.
7150 			 */
7151 			if ((m->m_pkthdr.csum_flags &
7152 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
7153 			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7154 				m->m_pkthdr.csum_tx_stuff += pre;
7155 				m->m_pkthdr.csum_tx_start += pre;
7156 			}
7157 
7158 			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
7159 				dlil_output_cksum_dbg(ifp, m, pre,
7160 				    proto_family);
7161 			}
7162 
7163 			/*
7164 			 * Clear the ifp if it was set above, and to be
7165 			 * safe, only if it is still the same as the
7166 			 * outbound ifp we have in context.  If it was
7167 			 * looped back, then a copy of it was sent to the
7168 			 * loopback interface with the rcvif set, and we
7169 			 * are clearing the one that will go down to the
7170 			 * layer below.
7171 			 */
7172 			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
7173 				m->m_pkthdr.rcvif = NULL;
7174 			}
7175 		}
7176 
7177 		/*
7178 		 * Let interface filters (if any) do their thing ...
7179 		 */
7180 		retval = dlil_interface_filters_output(ifp, &m, proto_family);
7181 		if (retval != 0) {
7182 			if (retval != EJUSTRETURN) {
7183 				m_freem(m);
7184 			}
7185 			goto next;
7186 		}
7187 		/*
7188 		 * Strip away M_PROTO1 bit prior to sending packet
7189 		 * to the driver as this field may be used by the driver
7190 		 */
7191 		m->m_flags &= ~M_PROTO1;
7192 
7193 		/*
7194 		 * If the underlying interface is not capable of handling a
7195 		 * packet whose data portion spans across physically disjoint
7196 		 * pages, we need to "normalize" the packet so that we pass
7197 		 * down a chain of mbufs where each mbuf points to a span that
7198 		 * resides in the system page boundary.  If the packet does
7199 		 * not cross page(s), the following is a no-op.
7200 		 */
7201 		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
7202 			if ((m = m_normalize(m)) == NULL) {
7203 				goto next;
7204 			}
7205 		}
7206 
7207 		/*
7208 		 * If this is a TSO packet, make sure the interface still
7209 		 * advertise TSO capability.
7210 		 */
7211 		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
7212 			retval = EMSGSIZE;
7213 			m_freem(m);
7214 			goto cleanup;
7215 		}
7216 
7217 		ifp_inc_traffic_class_out(ifp, m);
7218 
7219 #if SKYWALK
7220 		/*
7221 		 * For native skywalk devices, packets will be passed to pktap
7222 		 * after GSO or after the mbuf to packet conversion.
7223 		 * This is done for IPv4/IPv6 packets only because there is no
7224 		 * space in the mbuf to pass down the proto family.
7225 		 */
7226 		if (dlil_is_native_netif_nexus(ifp)) {
7227 			if (raw || m->m_pkthdr.pkt_proto == 0) {
7228 				pktap_output(ifp, proto_family, m, pre, post);
7229 				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
7230 			}
7231 		} else {
7232 			pktap_output(ifp, proto_family, m, pre, post);
7233 		}
7234 #else /* SKYWALK */
7235 		pktap_output(ifp, proto_family, m, pre, post);
7236 #endif /* SKYWALK */
7237 
7238 		/*
7239 		 * Count the number of elements in the mbuf chain
7240 		 */
7241 		if (tx_chain_len_count) {
7242 			dlil_count_chain_len(m, &tx_chain_len_stats);
7243 		}
7244 
7245 		/*
7246 		 * Discard partial sum information if this packet originated
7247 		 * from another interface; the packet would already have the
7248 		 * final checksum and we shouldn't recompute it.
7249 		 */
7250 		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
7251 		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
7252 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7253 			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
7254 			m->m_pkthdr.csum_data = 0;
7255 		}
7256 
7257 		/*
7258 		 * Finally, call the driver.
7259 		 */
7260 		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
7261 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
7262 				flen += (m_pktlen(m) - (pre + post));
7263 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
7264 			}
7265 			(void) mbuf_set_timestamp(m, now_nsec, TRUE);
7266 
7267 			*send_tail = m;
7268 			send_tail = &m->m_nextpkt;
7269 		} else {
7270 			/*
7271 			 * Record timestamp; ifnet_enqueue() will use this info
7272 			 * rather than redoing the work.
7273 			 */
7274 			nanouptime(&now);
7275 			net_timernsec(&now, &now_nsec);
7276 			(void) mbuf_set_timestamp(m, now_nsec, TRUE);
7277 
7278 			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
7279 				flen = (m_pktlen(m) - (pre + post));
7280 				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
7281 			} else {
7282 				flen = 0;
7283 			}
7284 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7285 			    0, 0, 0, 0, 0);
7286 			retval = (*ifp->if_output_dlil)(ifp, m);
7287 			if (retval == EQFULL || retval == EQSUSPENDED) {
7288 				if (adv != NULL && adv->code == FADV_SUCCESS) {
7289 					adv->code = (retval == EQFULL ?
7290 					    FADV_FLOW_CONTROLLED :
7291 					    FADV_SUSPENDED);
7292 				}
7293 				retval = 0;
7294 			}
7295 			if (retval == 0 && flen > 0) {
7296 				fbytes += flen;
7297 				fpkts++;
7298 			}
7299 			if (retval != 0 && dlil_verbose) {
7300 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7301 				    __func__, if_name(ifp),
7302 				    retval);
7303 			}
7304 			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
7305 			    0, 0, 0, 0, 0);
7306 		}
7307 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7308 
7309 next:
7310 		m = packetlist;
7311 		if (m != NULL) {
7312 			m->m_flags |= m_loop_set;
7313 			packetlist = packetlist->m_nextpkt;
7314 			m->m_nextpkt = NULL;
7315 		}
7316 		/* Reset the proto family to old proto family for CLAT */
7317 		if (did_clat46) {
7318 			proto_family = old_proto_family;
7319 		}
7320 	} while (m != NULL);
7321 
7322 	if (send_head != NULL) {
7323 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7324 		    0, 0, 0, 0, 0);
7325 		if (ifp->if_eflags & IFEF_SENDLIST) {
7326 			retval = (*ifp->if_output_dlil)(ifp, send_head);
7327 			if (retval == EQFULL || retval == EQSUSPENDED) {
7328 				if (adv != NULL) {
7329 					adv->code = (retval == EQFULL ?
7330 					    FADV_FLOW_CONTROLLED :
7331 					    FADV_SUSPENDED);
7332 				}
7333 				retval = 0;
7334 			}
7335 			if (retval == 0 && flen > 0) {
7336 				fbytes += flen;
7337 				fpkts++;
7338 			}
7339 			if (retval != 0 && dlil_verbose) {
7340 				DLIL_PRINTF("%s: output error on %s retval = %d\n",
7341 				    __func__, if_name(ifp), retval);
7342 			}
7343 		} else {
7344 			struct mbuf *send_m;
7345 			int enq_cnt = 0;
7346 			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
7347 			while (send_head != NULL) {
7348 				send_m = send_head;
7349 				send_head = send_m->m_nextpkt;
7350 				send_m->m_nextpkt = NULL;
7351 				retval = (*ifp->if_output_dlil)(ifp, send_m);
7352 				if (retval == EQFULL || retval == EQSUSPENDED) {
7353 					if (adv != NULL) {
7354 						adv->code = (retval == EQFULL ?
7355 						    FADV_FLOW_CONTROLLED :
7356 						    FADV_SUSPENDED);
7357 					}
7358 					retval = 0;
7359 				}
7360 				if (retval == 0) {
7361 					enq_cnt++;
7362 					if (flen > 0) {
7363 						fpkts++;
7364 					}
7365 				}
7366 				if (retval != 0 && dlil_verbose) {
7367 					DLIL_PRINTF("%s: output error on %s "
7368 					    "retval = %d\n",
7369 					    __func__, if_name(ifp), retval);
7370 				}
7371 			}
7372 			if (enq_cnt > 0) {
7373 				fbytes += flen;
7374 				ifnet_start(ifp);
7375 			}
7376 		}
7377 		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7378 	}
7379 
7380 	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7381 
7382 cleanup:
7383 	if (fbytes > 0) {
7384 		ifp->if_fbytes += fbytes;
7385 	}
7386 	if (fpkts > 0) {
7387 		ifp->if_fpackets += fpkts;
7388 	}
7389 	if (proto != NULL) {
7390 		if_proto_free(proto);
7391 	}
7392 	if (packetlist) { /* if any packets are left, clean up */
7393 		mbuf_freem_list(packetlist);
7394 	}
7395 	if (retval == EJUSTRETURN) {
7396 		retval = 0;
7397 	}
7398 	if (iorefcnt == 1) {
7399 		ifnet_datamov_end(ifp);
7400 	}
7401 	if (rt != NULL) {
7402 		rtfree(rt);
7403 		rt = NULL;
7404 	}
7405 
7406 	return retval;
7407 }
7408 
7409 /*
7410  * This routine checks if the destination address is not a loopback, link-local,
7411  * multicast or broadcast address.
7412  */
7413 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7414 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7415 {
7416 	int ret = 0;
7417 	switch (proto_family) {
7418 	case PF_INET: {
7419 		struct ip *iph = mtod(m, struct ip *);
7420 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7421 			ret = 1;
7422 		}
7423 		break;
7424 	}
7425 	case PF_INET6: {
7426 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7427 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7428 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
7429 			ret = 1;
7430 		}
7431 		break;
7432 	}
7433 	}
7434 
7435 	return ret;
7436 }
7437 /*
7438  * @brief This routine translates IPv4 packet to IPv6 packet,
7439  *     updates protocol checksum and also translates ICMP for code
7440  *     along with inner header translation.
7441  *
7442  * @param ifp Pointer to the interface
7443  * @param proto_family pointer to protocol family. It is updated if function
7444  *     performs the translation successfully.
7445  * @param m Pointer to the pointer pointing to the packet. Needed because this
7446  *     routine can end up changing the mbuf to a different one.
7447  *
7448  * @return 0 on success or else a negative value.
7449  */
7450 static errno_t
dlil_clat46(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7451 dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7452 {
7453 	VERIFY(*proto_family == PF_INET);
7454 	VERIFY(IS_INTF_CLAT46(ifp));
7455 
7456 	pbuf_t pbuf_store, *pbuf = NULL;
7457 	struct ip *iph = NULL;
7458 	struct in_addr osrc, odst;
7459 	uint8_t proto = 0;
7460 	struct in6_addr src_storage = {};
7461 	struct in6_addr *src = NULL;
7462 	struct sockaddr_in6 dstsock = {};
7463 	int error = 0;
7464 	uint16_t off = 0;
7465 	uint16_t tot_len = 0;
7466 	uint16_t ip_id_val = 0;
7467 	uint16_t ip_frag_off = 0;
7468 
7469 	boolean_t is_frag = FALSE;
7470 	boolean_t is_first_frag = TRUE;
7471 	boolean_t is_last_frag = TRUE;
7472 
7473 	pbuf_init_mbuf(&pbuf_store, *m, ifp);
7474 	pbuf = &pbuf_store;
7475 	iph = pbuf->pb_data;
7476 
7477 	osrc = iph->ip_src;
7478 	odst = iph->ip_dst;
7479 	proto = iph->ip_p;
7480 	off = (uint16_t)(iph->ip_hl << 2);
7481 	ip_id_val = iph->ip_id;
7482 	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;
7483 
7484 	tot_len = ntohs(iph->ip_len);
7485 
7486 	/*
7487 	 * For packets that are not first frags
7488 	 * we only need to adjust CSUM.
7489 	 * For 4 to 6, Fragmentation header gets appended
7490 	 * after proto translation.
7491 	 */
7492 	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
7493 		is_frag = TRUE;
7494 
7495 		/* If the offset is not zero, it is not first frag */
7496 		if (ip_frag_off != 0) {
7497 			is_first_frag = FALSE;
7498 		}
7499 
7500 		/* If IP_MF is set, then it is not last frag */
7501 		if (ntohs(iph->ip_off) & IP_MF) {
7502 			is_last_frag = FALSE;
7503 		}
7504 	}
7505 
7506 	/*
7507 	 * Translate IPv4 destination to IPv6 destination by using the
7508 	 * prefixes learned through prior PLAT discovery.
7509 	 */
7510 	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dstsock.sin6_addr)) != 0) {
7511 		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
7512 		goto cleanup;
7513 	}
7514 
7515 	dstsock.sin6_len = sizeof(struct sockaddr_in6);
7516 	dstsock.sin6_family = AF_INET6;
7517 
7518 	/*
7519 	 * Retrive the local IPv6 CLAT46 address reserved for stateless
7520 	 * translation.
7521 	 */
7522 	src = in6_selectsrc_core(&dstsock, 0, ifp, 0, &src_storage, NULL, &error,
7523 	    NULL, NULL, TRUE);
7524 
7525 	if (src == NULL) {
7526 		ip6stat.ip6s_clat464_out_nov6addr_drop++;
7527 		error = -1;
7528 		goto cleanup;
7529 	}
7530 
7531 
7532 	/* Translate the IP header part first */
7533 	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
7534 	    iph->ip_ttl, src_storage, dstsock.sin6_addr, tot_len) == NT_NAT64) ? 0 : -1;
7535 
7536 	iph = NULL;     /* Invalidate iph as pbuf has been modified */
7537 
7538 	if (error != 0) {
7539 		ip6stat.ip6s_clat464_out_46transfail_drop++;
7540 		goto cleanup;
7541 	}
7542 
7543 	/*
7544 	 * Translate protocol header, update checksum, checksum flags
7545 	 * and related fields.
7546 	 */
7547 	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
7548 	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;
7549 
7550 	if (error != 0) {
7551 		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
7552 		goto cleanup;
7553 	}
7554 
7555 	/* Now insert the IPv6 fragment header */
7556 	if (is_frag) {
7557 		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);
7558 
7559 		if (error != 0) {
7560 			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
7561 			goto cleanup;
7562 		}
7563 	}
7564 
7565 cleanup:
7566 	if (pbuf_is_valid(pbuf)) {
7567 		*m = pbuf->pb_mbuf;
7568 		pbuf->pb_mbuf = NULL;
7569 		pbuf_destroy(pbuf);
7570 	} else {
7571 		error = -1;
7572 		*m = NULL;
7573 		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
7574 	}
7575 
7576 	if (error == 0) {
7577 		*proto_family = PF_INET6;
7578 		ip6stat.ip6s_clat464_out_success++;
7579 	}
7580 
7581 	return error;
7582 }
7583 
7584 /*
7585  * @brief This routine translates incoming IPv6 to IPv4 packet,
7586  *     updates protocol checksum and also translates ICMPv6 outer
7587  *     and inner headers
7588  *
7589  * @return 0 on success or else a negative value.
7590  */
7591 static errno_t
dlil_clat64(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7592 dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7593 {
7594 	VERIFY(*proto_family == PF_INET6);
7595 	VERIFY(IS_INTF_CLAT46(ifp));
7596 
7597 	struct ip6_hdr *ip6h = NULL;
7598 	struct in6_addr osrc, odst;
7599 	uint8_t proto = 0;
7600 	struct in6_ifaddr *ia6_clat_dst = NULL;
7601 	struct in_ifaddr *ia4_clat_dst = NULL;
7602 	struct in_addr *dst = NULL;
7603 	struct in_addr src;
7604 	int error = 0;
7605 	uint32_t off = 0;
7606 	u_int64_t tot_len = 0;
7607 	uint8_t tos = 0;
7608 	boolean_t is_first_frag = TRUE;
7609 
7610 	/* Incoming mbuf does not contain valid IP6 header */
7611 	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
7612 	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
7613 	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
7614 		ip6stat.ip6s_clat464_in_tooshort_drop++;
7615 		return -1;
7616 	}
7617 
7618 	ip6h = mtod(*m, struct ip6_hdr *);
7619 	/* Validate that mbuf contains IP payload equal to ip6_plen  */
7620 	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
7621 		ip6stat.ip6s_clat464_in_tooshort_drop++;
7622 		return -1;
7623 	}
7624 
7625 	osrc = ip6h->ip6_src;
7626 	odst = ip6h->ip6_dst;
7627 
7628 	/*
7629 	 * Retrieve the local CLAT46 reserved IPv6 address.
7630 	 * Let the packet pass if we don't find one, as the flag
7631 	 * may get set before IPv6 configuration has taken place.
7632 	 */
7633 	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7634 	if (ia6_clat_dst == NULL) {
7635 		goto done;
7636 	}
7637 
7638 	/*
7639 	 * Check if the original dest in the packet is same as the reserved
7640 	 * CLAT46 IPv6 address
7641 	 */
7642 	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
7643 		pbuf_t pbuf_store, *pbuf = NULL;
7644 		pbuf_init_mbuf(&pbuf_store, *m, ifp);
7645 		pbuf = &pbuf_store;
7646 
7647 		/*
7648 		 * Retrive the local CLAT46 IPv4 address reserved for stateless
7649 		 * translation.
7650 		 */
7651 		ia4_clat_dst = inifa_ifpclatv4(ifp);
7652 		if (ia4_clat_dst == NULL) {
7653 			ifa_remref(&ia6_clat_dst->ia_ifa);
7654 			ip6stat.ip6s_clat464_in_nov4addr_drop++;
7655 			error = -1;
7656 			goto cleanup;
7657 		}
7658 		ifa_remref(&ia6_clat_dst->ia_ifa);
7659 
7660 		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
7661 		dst = &ia4_clat_dst->ia_addr.sin_addr;
7662 		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
7663 			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
7664 			error = -1;
7665 			goto cleanup;
7666 		}
7667 
7668 		ip6h = pbuf->pb_data;
7669 		off = sizeof(struct ip6_hdr);
7670 		proto = ip6h->ip6_nxt;
7671 		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
7672 		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);
7673 
7674 		/*
7675 		 * Translate the IP header and update the fragmentation
7676 		 * header if needed
7677 		 */
7678 		error = (nat464_translate_64(pbuf, off, tos, &proto,
7679 		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
7680 		    0 : -1;
7681 
7682 		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */
7683 
7684 		if (error != 0) {
7685 			ip6stat.ip6s_clat464_in_64transfail_drop++;
7686 			goto cleanup;
7687 		}
7688 
7689 		/*
7690 		 * Translate protocol header, update checksum, checksum flags
7691 		 * and related fields.
7692 		 */
7693 		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
7694 		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
7695 		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;
7696 
7697 		if (error != 0) {
7698 			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
7699 			goto cleanup;
7700 		}
7701 
7702 cleanup:
7703 		if (ia4_clat_dst != NULL) {
7704 			ifa_remref(&ia4_clat_dst->ia_ifa);
7705 		}
7706 
7707 		if (pbuf_is_valid(pbuf)) {
7708 			*m = pbuf->pb_mbuf;
7709 			pbuf->pb_mbuf = NULL;
7710 			pbuf_destroy(pbuf);
7711 		} else {
7712 			error = -1;
7713 			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
7714 		}
7715 
7716 		if (error == 0) {
7717 			*proto_family = PF_INET;
7718 			ip6stat.ip6s_clat464_in_success++;
7719 		}
7720 	} /* CLAT traffic */
7721 
7722 done:
7723 	return error;
7724 }
7725 
7726 /* The following is used to enqueue work items for ifnet ioctl events */
7727 static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);
7728 
/* Deferred ioctl request; carries the io ref taken by ifnet_ioctl_async() */
struct ifnet_ioctl_event {
	struct ifnet *ifp;      /* interface the ioctl applies to */
	u_long ioctl_code;      /* e.g. SIOCADDMULTI / SIOCDELMULTI */
};

struct ifnet_ioctl_event_nwk_wq_entry {
	/* Work-queue linkage; the callback recovers this via __container_of */
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7738 
7739 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7740 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7741 {
7742 	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7743 	bool compare_expected;
7744 
7745 	/*
7746 	 * Get an io ref count if the interface is attached.
7747 	 * At this point it most likely is. We are taking a reference for
7748 	 * deferred processing.
7749 	 */
7750 	if (!ifnet_is_attached(ifp, 1)) {
7751 		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7752 		    "is not attached",
7753 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7754 		return;
7755 	}
7756 	switch (ioctl_code) {
7757 	case SIOCADDMULTI:
7758 		compare_expected = false;
7759 		if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
7760 			ifnet_decr_iorefcnt(ifp);
7761 			return;
7762 		}
7763 		break;
7764 	case SIOCDELMULTI:
7765 		compare_expected = false;
7766 		if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
7767 			ifnet_decr_iorefcnt(ifp);
7768 			return;
7769 		}
7770 		break;
7771 	default:
7772 		os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
7773 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7774 		return;
7775 	}
7776 
7777 	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7778 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
7779 
7780 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7781 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7782 	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7783 	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7784 }
7785 
7786 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7787 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7788 {
7789 	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7790 	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7791 
7792 	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7793 	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7794 	int ret = 0;
7795 
7796 	switch (ioctl_code) {
7797 	case SIOCADDMULTI:
7798 		atomic_store(&ifp->if_mcast_add_signaled, false);
7799 		break;
7800 	case SIOCDELMULTI:
7801 		atomic_store(&ifp->if_mcast_del_signaled, false);
7802 		break;
7803 	}
7804 	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7805 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7806 		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7807 	} else if (dlil_verbose) {
7808 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7809 		    "for ioctl %lu",
7810 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7811 	}
7812 	ifnet_decr_iorefcnt(ifp);
7813 	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7814 	return;
7815 }
7816 
/*
 * Dispatch an ioctl on an interface.  Interface filters get the first
 * look, then the attached protocol (when proto_fam != 0), and finally
 * the driver's if_ioctl.  retval starts as EOPNOTSUPP ("unhandled") and
 * is only overwritten while no layer has claimed the ioctl; a result of
 * EJUSTRETURN stops the walk and is mapped to 0 on return.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;        /* "nobody handled it yet" sentinel */
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/*
			 * Drop the lock around the callout; the busy state
			 * set above keeps the filter list stable meanwhile.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled, stop processing" -- report success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	/* Release the io ref taken at entry */
	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7934 
7935 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7936 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7937 {
7938 	errno_t error = 0;
7939 
7940 	if (ifp->if_set_bpf_tap) {
7941 		/* Get an io reference on the interface if it is attached */
7942 		if (!ifnet_is_attached(ifp, 1)) {
7943 			return ENXIO;
7944 		}
7945 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7946 		ifnet_decr_iorefcnt(ifp);
7947 	}
7948 	return error;
7949 }
7950 
7951 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7952 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7953     struct sockaddr *ll_addr, size_t ll_len)
7954 {
7955 	errno_t result = EOPNOTSUPP;
7956 	struct if_proto *proto;
7957 	const struct sockaddr *verify;
7958 	proto_media_resolve_multi resolvep;
7959 
7960 	if (!ifnet_is_attached(ifp, 1)) {
7961 		return result;
7962 	}
7963 
7964 	bzero(ll_addr, ll_len);
7965 
7966 	/* Call the protocol first; callee holds a proto refcnt upon success */
7967 	ifnet_lock_shared(ifp);
7968 	proto = find_attached_proto(ifp, proto_addr->sa_family);
7969 	ifnet_lock_done(ifp);
7970 	if (proto != NULL) {
7971 		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7972 		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7973 		if (resolvep != NULL) {
7974 			result = resolvep(ifp, proto_addr, SDL(ll_addr), ll_len);
7975 		}
7976 		if_proto_free(proto);
7977 	}
7978 
7979 	/* Let the interface verify the multicast address */
7980 	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7981 		if (result == 0) {
7982 			verify = ll_addr;
7983 		} else {
7984 			verify = proto_addr;
7985 		}
7986 		result = ifp->if_check_multi(ifp, verify);
7987 	}
7988 
7989 	ifnet_decr_iorefcnt(ifp);
7990 	return result;
7991 }
7992 
7993 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7994 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7995     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7996     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7997 {
7998 	struct if_proto *proto;
7999 	errno_t result = 0;
8000 
8001 	if ((ifp->if_flags & IFF_NOARP) != 0) {
8002 		result = ENOTSUP;
8003 		goto done;
8004 	}
8005 
8006 	/* callee holds a proto refcnt upon success */
8007 	ifnet_lock_shared(ifp);
8008 	proto = find_attached_proto(ifp, target_proto->sa_family);
8009 	ifnet_lock_done(ifp);
8010 	if (proto == NULL) {
8011 		result = ENOTSUP;
8012 	} else {
8013 		proto_media_send_arp    arpp;
8014 		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
8015 		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
8016 		if (arpp == NULL) {
8017 			result = ENOTSUP;
8018 		} else {
8019 			switch (arpop) {
8020 			case ARPOP_REQUEST:
8021 				arpstat.txrequests++;
8022 				if (target_hw != NULL) {
8023 					arpstat.txurequests++;
8024 				}
8025 				break;
8026 			case ARPOP_REPLY:
8027 				arpstat.txreplies++;
8028 				break;
8029 			}
8030 			result = arpp(ifp, arpop, sender_hw, sender_proto,
8031 			    target_hw, target_proto);
8032 		}
8033 		if_proto_free(proto);
8034 	}
8035 done:
8036 	return result;
8037 }
8038 
/*
 * Network thread marks: tokens for setting/clearing bits in the current
 * uthread's uu_network_marks.  A token is a pointer offset from
 * net_thread_marks_base encoding which bits a push call changed; see
 * net_thread_marks_push()/net_thread_marks_pop().
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* Token meaning "no marks were changed" (offset 0 from the base) */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
8044 
/*
 * Set the requested mark bits on the current thread.  The returned token
 * encodes -- as a byte offset from a unique base address -- exactly those
 * bits this call newly set; net_thread_marks_pop() recovers them by
 * pointer subtraction and clears only those, so nested push/pop pairs
 * compose correctly.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* Only the bits that are not already set on this thread */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	return (net_thread_marks_t)&base[pop];
}
8062 
/*
 * Clear the requested mark bits on the current thread.  The returned
 * token encodes, as an offset from the base address, exactly those bits
 * this call cleared; net_thread_unmarks_pop() restores them.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* Only the bits that are currently set on this thread */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	return (net_thread_marks_t)&base[unpop];
}
8080 
/*
 * Undo a net_thread_marks_push(): recover the bits recorded in the token
 * via pointer subtraction from the base, and clear exactly those bits.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* The token must encode only 32-bit mark bits */
		VERIFY((pop & ones) == pop);
		/* Every bit being popped must currently be set */
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
8096 
/*
 * Undo a net_thread_unmarks_push(): recover the bits recorded in the
 * token via pointer subtraction from the base, and re-set exactly those
 * bits on the current thread.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* The token must encode only 32-bit mark bits */
		VERIFY((unpop & ones) == unpop);
		/* The bits being restored must currently be clear */
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= (u_int32_t)unpop;
	}
}
8112 
8113 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)8114 net_thread_is_marked(u_int32_t check)
8115 {
8116 	if (check != 0) {
8117 		struct uthread *uth = current_uthread();
8118 		return uth->uu_network_marks & check;
8119 	} else {
8120 		return 0;
8121 	}
8122 }
8123 
8124 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)8125 net_thread_is_unmarked(u_int32_t check)
8126 {
8127 	if (check != 0) {
8128 		struct uthread *uth = current_uthread();
8129 		return ~uth->uu_network_marks & check;
8130 	} else {
8131 		return 0;
8132 	}
8133 }
8134 
8135 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)8136 _is_announcement(const struct sockaddr_in * sender_sin,
8137     const struct sockaddr_in * target_sin)
8138 {
8139 	if (target_sin == NULL || sender_sin == NULL) {
8140 		return FALSE;
8141 	}
8142 
8143 	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
8144 }
8145 
/*
 * Send an ARP packet on behalf of a protocol.  For IPv4 link-local
 * targets, ARP requests (other than announcements, where sender IP ==
 * target IP) are sent on every interface marked IFEF_ARPLL rather than
 * just the one supplied by the caller.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = __DECONST_SA(target_proto0);

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		SOCKADDR_COPY(target_proto, &target_proto_sinarp, sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = SA(&target_proto_sinarp);
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = SIN(sender_proto);
	target_sin = SIN(target_proto);
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						SOCKADDR_COPY(SIN(source_ip->ifa_addr), &source_ip_copy, sizeof(source_ip_copy));
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* Keep the link-layer address alive across the send */
				ifa_addref(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, SDL(source_hw->ifa_addr),
				    SA(&source_ip_copy), NULL,
				    target_proto);

				ifa_remref(source_hw);
				/* Keep the first definitive (non-ENOTSUP) result */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
8256 
8257 /*
8258  * Caller must hold ifnet head lock.
8259  */
8260 static int
ifnet_lookup(struct ifnet * ifp)8261 ifnet_lookup(struct ifnet *ifp)
8262 {
8263 	struct ifnet *_ifp;
8264 
8265 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
8266 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
8267 		if (_ifp == ifp) {
8268 			break;
8269 		}
8270 	}
8271 	return _ifp != NULL;
8272 }
8273 
8274 /*
8275  * Caller has to pass a non-zero refio argument to get a
8276  * IO reference count. This will prevent ifnet_detach from
8277  * being called when there are outstanding io reference counts.
8278  */
8279 int
ifnet_is_attached(struct ifnet * ifp,int refio)8280 ifnet_is_attached(struct ifnet *ifp, int refio)
8281 {
8282 	int ret;
8283 
8284 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8285 	if ((ret = IF_FULLY_ATTACHED(ifp))) {
8286 		if (refio > 0) {
8287 			ifp->if_refio++;
8288 		}
8289 	}
8290 	lck_mtx_unlock(&ifp->if_ref_lock);
8291 
8292 	return ret;
8293 }
8294 
/*
 * Account for a kernel thread started on behalf of this interface that
 * has not yet begun running; balanced by
 * ifnet_decr_pending_thread_count() from the thread itself.
 */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8302 
8303 void
ifnet_decr_pending_thread_count(struct ifnet * ifp)8304 ifnet_decr_pending_thread_count(struct ifnet *ifp)
8305 {
8306 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8307 	VERIFY(ifp->if_threads_pending > 0);
8308 	ifp->if_threads_pending--;
8309 	if (ifp->if_threads_pending == 0) {
8310 		wakeup(&ifp->if_threads_pending);
8311 	}
8312 	lck_mtx_unlock(&ifp->if_ref_lock);
8313 }
8314 
/*
 * Take an additional I/O reference on an interface that already holds
 * at least one (both conditions are VERIFY'd below); the caller must
 * guarantee this.  Most callers would call
 * ifnet_is_{attached,data_ready}() instead.
 */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8329 
/*
 * Drop one I/O reference; if_ref_lock must be held by the caller.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* every data mover holds an I/O ref, so if_refio bounds if_datamov */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8350 
/*
 * Drop one I/O reference, taking if_ref_lock on behalf of the caller.
 */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8358 
8359 boolean_t
ifnet_datamov_begin(struct ifnet * ifp)8360 ifnet_datamov_begin(struct ifnet *ifp)
8361 {
8362 	boolean_t ret;
8363 
8364 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8365 	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8366 		ifp->if_refio++;
8367 		ifp->if_datamov++;
8368 	}
8369 	lck_mtx_unlock(&ifp->if_ref_lock);
8370 
8371 	DTRACE_IP2(datamov__begin, struct ifnet *, ifp, boolean_t, ret);
8372 	return ret;
8373 }
8374 
/*
 * Leave the data path: drop the data-movement reference taken by
 * ifnet_datamov_begin(), waking any blocked drainers when the last
 * mover leaves, and drop the accompanying I/O reference.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	/* drop the I/O reference taken in ifnet_datamov_begin() */
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);

	DTRACE_IP1(datamov__end, struct ifnet *, ifp);
}
8394 
/*
 * Suspend data movement; if_ref_lock must be held.  Takes an I/O
 * reference that ifnet_datamov_resume() drops.  The first suspender
 * clears IFRF_READY, which keeps new data movers out (see
 * ifnet_datamov_begin).
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8405 
/*
 * Suspend data movement on an attached (or detaching) interface.
 * Must be balanced by ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8414 
8415 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8416 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8417 {
8418 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8419 	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8420 	if (ifp->if_suspend > 0) {
8421 		lck_mtx_unlock(&ifp->if_ref_lock);
8422 		return FALSE;
8423 	}
8424 	ifnet_datamov_suspend_locked(ifp);
8425 	lck_mtx_unlock(&ifp->if_ref_lock);
8426 	return TRUE;
8427 }
8428 
/*
 * Wait until every in-flight data mover (ifnet_datamov_begin/_end)
 * has left the interface, then flush its transmit queue(s).  Data
 * movement must already be suspended, so no new movers can enter
 * while we wait.  Uses lck_mtx_lock (not the spin variant) because
 * msleep() below may block.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* sleep until ifnet_datamov_end() drops if_datamov to zero */
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8456 
/*
 * Convenience wrapper: suspend data movement, then wait for any
 * threads already in the data path to drain out.
 */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8463 
/*
 * Undo one ifnet_datamov_suspend(): the last resumer restores
 * IFRF_READY so new data movers may enter again.  Also drops the I/O
 * reference taken at suspend time.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8477 
8478 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8479 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8480 {
8481 	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8482 	ctrace_t *tr;
8483 	u_int32_t idx;
8484 	u_int16_t *cnt;
8485 
8486 	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8487 		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8488 		/* NOTREACHED */
8489 	}
8490 
8491 	if (refhold) {
8492 		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8493 		tr = dl_if_dbg->dldbg_if_refhold;
8494 	} else {
8495 		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8496 		tr = dl_if_dbg->dldbg_if_refrele;
8497 	}
8498 
8499 	idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8500 	ctrace_record(&tr[idx]);
8501 }
8502 
8503 errno_t
dlil_if_ref(struct ifnet * ifp)8504 dlil_if_ref(struct ifnet *ifp)
8505 {
8506 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8507 
8508 	if (dl_if == NULL) {
8509 		return EINVAL;
8510 	}
8511 
8512 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8513 	++dl_if->dl_if_refcnt;
8514 	if (dl_if->dl_if_refcnt == 0) {
8515 		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
8516 		/* NOTREACHED */
8517 	}
8518 	if (dl_if->dl_if_trace != NULL) {
8519 		(*dl_if->dl_if_trace)(dl_if, TRUE);
8520 	}
8521 	lck_mtx_unlock(&dl_if->dl_if_lock);
8522 
8523 	return 0;
8524 }
8525 
8526 errno_t
dlil_if_free(struct ifnet * ifp)8527 dlil_if_free(struct ifnet *ifp)
8528 {
8529 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8530 	bool need_release = FALSE;
8531 
8532 	if (dl_if == NULL) {
8533 		return EINVAL;
8534 	}
8535 
8536 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8537 	switch (dl_if->dl_if_refcnt) {
8538 	case 0:
8539 		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
8540 		/* NOTREACHED */
8541 		break;
8542 	case 1:
8543 		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8544 			need_release = TRUE;
8545 		}
8546 		break;
8547 	default:
8548 		break;
8549 	}
8550 	--dl_if->dl_if_refcnt;
8551 	if (dl_if->dl_if_trace != NULL) {
8552 		(*dl_if->dl_if_trace)(dl_if, FALSE);
8553 	}
8554 	lck_mtx_unlock(&dl_if->dl_if_lock);
8555 	if (need_release) {
8556 		_dlil_if_release(ifp, true);
8557 	}
8558 	return 0;
8559 }
8560 
/*
 * Attach a constructed if_proto to its interface: let the family
 * module refine the demux descriptors, insert the protocol into the
 * interface's protocol hash, and post KEV_DL_PROTO_ATTACHED.  On
 * success, *proto_count (if non-NULL) receives the number of
 * protocols now attached.  Returns EEXIST when the family is already
 * attached, ENXIO when the interface is detaching, or the family
 * module's add_proto error.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* take an I/O reference to keep the interface from detaching */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		/* this protocol family is already attached */
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash, at the tail of its chain.
	 */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	/* announce the attach to interested parties */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	/* drop the I/O reference taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8640 
8641 static void
dlil_handle_proto_attach(ifnet_t ifp,protocol_family_t protocol)8642 dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
8643 {
8644 	/*
8645 	 * A protocol has been attached, mark the interface up.
8646 	 * This used to be done by configd.KernelEventMonitor, but that
8647 	 * is inherently prone to races (rdar://problem/30810208).
8648 	 */
8649 	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
8650 	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
8651 	dlil_post_sifflags_msg(ifp);
8652 #if SKYWALK
8653 	switch (protocol) {
8654 	case AF_INET:
8655 	case AF_INET6:
8656 		/* don't attach the flowswitch unless attaching IP */
8657 		dlil_attach_flowswitch_nexus(ifp);
8658 		break;
8659 	default:
8660 		break;
8661 	}
8662 #endif /* SKYWALK */
8663 }
8664 
/*
 * Public KPI: attach a v1 protocol to an interface.  Validates the
 * arguments, allocates an if_proto carrying the caller's v1
 * callbacks, and hands it to dlil_attach_protocol().  On failure the
 * allocation is returned to its zone; on success the attach
 * side-effects (interface marked up, etc.) are applied.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	/* record the caller's v1 KPI callbacks */
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* mark the interface up, attach flowswitch if IP, etc. */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed; return the unused if_proto to its zone */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8726 
/*
 * Public KPI: attach a v2 protocol to an interface.  Identical to
 * ifnet_attach_protocol() except the if_proto carries the caller's
 * v2 callbacks (kProtoKPI_v2).
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	/* record the caller's v2 KPI callbacks */
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* mark the interface up, attach flowswitch if IP, etc. */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed; return the unused if_proto to its zone */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8788 
/*
 * Public KPI: detach a protocol family from an interface.  Unhooks
 * the protocol from the hash, neuters its callbacks with the
 * ifproto_media_* stubs so late callers get ENXIO instead of calling
 * freed client code, and drops the references keeping it alive.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/*
	 * Replace the client's callbacks with inert stubs so any caller
	 * still holding a proto reference cannot invoke detached code.
	 */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8854 
8855 static errno_t
ifproto_media_input_v1(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet,char * header)8856 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8857     struct mbuf *packet, char *header)
8858 {
8859 #pragma unused(ifp, protocol, packet, header)
8860 	return ENXIO;
8861 }
8862 
8863 static errno_t
ifproto_media_input_v2(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet)8864 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8865     struct mbuf *packet)
8866 {
8867 #pragma unused(ifp, protocol, packet)
8868 	return ENXIO;
8869 }
8870 
8871 static errno_t
ifproto_media_preout(struct ifnet * ifp,protocol_family_t protocol,mbuf_t * packet,const struct sockaddr * dest,void * route,char * frame_type,char * link_layer_dest)8872 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8873     mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8874     char *link_layer_dest)
8875 {
8876 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8877 	return ENXIO;
8878 }
8879 
8880 static void
ifproto_media_event(struct ifnet * ifp,protocol_family_t protocol,const struct kev_msg * event)8881 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8882     const struct kev_msg *event)
8883 {
8884 #pragma unused(ifp, protocol, event)
8885 }
8886 
8887 static errno_t
ifproto_media_ioctl(struct ifnet * ifp,protocol_family_t protocol,unsigned long command,void * argument)8888 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8889     unsigned long command, void *argument)
8890 {
8891 #pragma unused(ifp, protocol, command, argument)
8892 	return ENXIO;
8893 }
8894 
8895 static errno_t
ifproto_media_resolve_multi(ifnet_t ifp,const struct sockaddr * proto_addr,struct sockaddr_dl * out_ll,size_t ll_len)8896 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8897     struct sockaddr_dl *out_ll, size_t ll_len)
8898 {
8899 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8900 	return ENXIO;
8901 }
8902 
8903 static errno_t
ifproto_media_send_arp(struct ifnet * ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8904 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8905     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8906     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8907 {
8908 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8909 	return ENXIO;
8910 }
8911 
8912 extern int if_next_index(void);
8913 extern int tcp_ecn_outbound;
8914 
8915 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8916 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8917 {
8918 	uint32_t sflags = 0;
8919 	int err;
8920 
8921 	if (if_flowadv) {
8922 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8923 	}
8924 
8925 	if (if_delaybased_queue) {
8926 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8927 	}
8928 
8929 	if (ifp->if_output_sched_model ==
8930 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8931 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8932 	}
8933 	/* Inherit drop limit from the default queue */
8934 	if (ifp->if_snd != ifcq) {
8935 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8936 	}
8937 	/* Initialize transmit queue(s) */
8938 	err = ifclassq_setup(ifcq, ifp, sflags);
8939 	if (err != 0) {
8940 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8941 		    "err=%d", __func__, ifp, err);
8942 		/* NOTREACHED */
8943 	}
8944 }
8945 
8946 errno_t
ifnet_attach(ifnet_t ifp,const struct sockaddr_dl * ll_addr)8947 ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
8948 {
8949 #if SKYWALK
8950 	boolean_t netif_compat;
8951 	if_nexus_netif  nexus_netif;
8952 #endif /* SKYWALK */
8953 	struct ifnet *tmp_if;
8954 	struct ifaddr *ifa;
8955 	struct if_data_internal if_data_saved;
8956 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8957 	struct dlil_threading_info *dl_inp;
8958 	thread_continue_t thfunc = NULL;
8959 	int err;
8960 
8961 	if (ifp == NULL) {
8962 		return EINVAL;
8963 	}
8964 
8965 	/*
8966 	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
8967 	 * prevent the interface from being configured while it is
8968 	 * embryonic, as ifnet_head_lock is dropped and reacquired
8969 	 * below prior to marking the ifnet with IFRF_ATTACHED.
8970 	 */
8971 	dlil_if_lock();
8972 	ifnet_head_lock_exclusive();
8973 	/* Verify we aren't already on the list */
8974 	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
8975 		if (tmp_if == ifp) {
8976 			ifnet_head_done();
8977 			dlil_if_unlock();
8978 			return EEXIST;
8979 		}
8980 	}
8981 
8982 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8983 	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
8984 		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
8985 		    __func__, ifp);
8986 		/* NOTREACHED */
8987 	}
8988 	lck_mtx_unlock(&ifp->if_ref_lock);
8989 
8990 	ifnet_lock_exclusive(ifp);
8991 
8992 	/* Sanity check */
8993 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
8994 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
8995 	VERIFY(ifp->if_threads_pending == 0);
8996 
8997 	if (ll_addr != NULL) {
8998 		if (ifp->if_addrlen == 0) {
8999 			ifp->if_addrlen = ll_addr->sdl_alen;
9000 		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
9001 			ifnet_lock_done(ifp);
9002 			ifnet_head_done();
9003 			dlil_if_unlock();
9004 			return EINVAL;
9005 		}
9006 	}
9007 
9008 	/*
9009 	 * Allow interfaces without protocol families to attach
9010 	 * only if they have the necessary fields filled out.
9011 	 */
9012 	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
9013 		DLIL_PRINTF("%s: Attempt to attach interface without "
9014 		    "family module - %d\n", __func__, ifp->if_family);
9015 		ifnet_lock_done(ifp);
9016 		ifnet_head_done();
9017 		dlil_if_unlock();
9018 		return ENODEV;
9019 	}
9020 
9021 	/* Allocate protocol hash table */
9022 	VERIFY(ifp->if_proto_hash == NULL);
9023 	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
9024 	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9025 
9026 	lck_mtx_lock_spin(&ifp->if_flt_lock);
9027 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9028 	TAILQ_INIT(&ifp->if_flt_head);
9029 	VERIFY(ifp->if_flt_busy == 0);
9030 	VERIFY(ifp->if_flt_waiters == 0);
9031 	VERIFY(ifp->if_flt_non_os_count == 0);
9032 	VERIFY(ifp->if_flt_no_tso_count == 0);
9033 	lck_mtx_unlock(&ifp->if_flt_lock);
9034 
9035 	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
9036 		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
9037 		LIST_INIT(&ifp->if_multiaddrs);
9038 	}
9039 
9040 	VERIFY(ifp->if_allhostsinm == NULL);
9041 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9042 	TAILQ_INIT(&ifp->if_addrhead);
9043 
9044 	if (ifp->if_index == 0) {
9045 		int idx = if_next_index();
9046 
9047 		/*
9048 		 * Since we exhausted the list of
9049 		 * if_index's, try to find an empty slot
9050 		 * in ifindex2ifnet.
9051 		 */
9052 		if (idx == -1 && if_index >= UINT16_MAX) {
9053 			for (int i = 1; i < if_index; i++) {
9054 				if (ifindex2ifnet[i] == NULL &&
9055 				    ifnet_addrs[i - 1] == NULL) {
9056 					idx = i;
9057 					break;
9058 				}
9059 			}
9060 		}
9061 		if (idx == -1) {
9062 			ifp->if_index = 0;
9063 			ifnet_lock_done(ifp);
9064 			ifnet_head_done();
9065 			dlil_if_unlock();
9066 			return ENOBUFS;
9067 		}
9068 		ifp->if_index = (uint16_t)idx;
9069 
9070 		/* the lladdr passed at attach time is the permanent address */
9071 		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
9072 		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
9073 			bcopy(CONST_LLADDR(ll_addr),
9074 			    dl_if->dl_if_permanent_ether,
9075 			    ETHER_ADDR_LEN);
9076 			dl_if->dl_if_permanent_ether_is_set = 1;
9077 		}
9078 	}
9079 	/* There should not be anything occupying this slot */
9080 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9081 
9082 	/* allocate (if needed) and initialize a link address */
9083 	ifa = dlil_alloc_lladdr(ifp, ll_addr);
9084 	if (ifa == NULL) {
9085 		ifnet_lock_done(ifp);
9086 		ifnet_head_done();
9087 		dlil_if_unlock();
9088 		return ENOBUFS;
9089 	}
9090 
9091 	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
9092 	ifnet_addrs[ifp->if_index - 1] = ifa;
9093 
9094 	/* make this address the first on the list */
9095 	IFA_LOCK(ifa);
9096 	/* hold a reference for ifnet_addrs[] */
9097 	ifa_addref(ifa);
9098 	/* if_attach_link_ifa() holds a reference for ifa_link */
9099 	if_attach_link_ifa(ifp, ifa);
9100 	IFA_UNLOCK(ifa);
9101 
9102 	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
9103 	ifindex2ifnet[ifp->if_index] = ifp;
9104 
9105 	/* Hold a reference to the underlying dlil_ifnet */
9106 	ifnet_reference(ifp);
9107 
9108 	/* Clear stats (save and restore other fields that we care) */
9109 	if_data_saved = ifp->if_data;
9110 	bzero(&ifp->if_data, sizeof(ifp->if_data));
9111 	ifp->if_data.ifi_type = if_data_saved.ifi_type;
9112 	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
9113 	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
9114 	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
9115 	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
9116 	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
9117 	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
9118 	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
9119 	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
9120 	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
9121 	ifnet_touch_lastchange(ifp);
9122 
9123 	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
9124 	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
9125 	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);
9126 
9127 	dlil_ifclassq_setup(ifp, ifp->if_snd);
9128 
9129 	/* Sanity checks on the input thread storage */
9130 	dl_inp = &dl_if->dl_if_inpstorage;
9131 	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
9132 	VERIFY(dl_inp->dlth_flags == 0);
9133 	VERIFY(dl_inp->dlth_wtot == 0);
9134 	VERIFY(dl_inp->dlth_ifp == NULL);
9135 	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
9136 	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
9137 	VERIFY(!dl_inp->dlth_affinity);
9138 	VERIFY(ifp->if_inp == NULL);
9139 	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
9140 	VERIFY(dl_inp->dlth_strategy == NULL);
9141 	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
9142 	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
9143 	VERIFY(dl_inp->dlth_affinity_tag == 0);
9144 
9145 #if IFNET_INPUT_SANITY_CHK
9146 	VERIFY(dl_inp->dlth_pkts_cnt == 0);
9147 #endif /* IFNET_INPUT_SANITY_CHK */
9148 
9149 	VERIFY(ifp->if_poll_thread == THREAD_NULL);
9150 	dlil_reset_rxpoll_params(ifp);
9151 	/*
9152 	 * A specific DLIL input thread is created per non-loopback interface.
9153 	 */
9154 	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
9155 		ifp->if_inp = dl_inp;
9156 		ifnet_incr_pending_thread_count(ifp);
9157 		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
9158 		if (err == ENODEV) {
9159 			VERIFY(thfunc == NULL);
9160 			ifnet_decr_pending_thread_count(ifp);
9161 		} else if (err != 0) {
9162 			panic_plain("%s: ifp=%p couldn't get an input thread; "
9163 			    "err=%d", __func__, ifp, err);
9164 			/* NOTREACHED */
9165 		}
9166 	}
9167 	/*
9168 	 * If the driver supports the new transmit model, calculate flow hash
9169 	 * and create a workloop starter thread to invoke the if_start callback
9170 	 * where the packets may be dequeued and transmitted.
9171 	 */
9172 	if (ifp->if_eflags & IFEF_TXSTART) {
9173 		thread_precedence_policy_data_t info;
9174 		__unused kern_return_t kret;
9175 
9176 		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
9177 		VERIFY(ifp->if_flowhash != 0);
9178 		VERIFY(ifp->if_start_thread == THREAD_NULL);
9179 
9180 		ifnet_set_start_cycle(ifp, NULL);
9181 		ifp->if_start_active = 0;
9182 		ifp->if_start_req = 0;
9183 		ifp->if_start_flags = 0;
9184 		VERIFY(ifp->if_start != NULL);
9185 		ifnet_incr_pending_thread_count(ifp);
9186 		if ((err = kernel_thread_start(ifnet_start_thread_func,
9187 		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
9188 			panic_plain("%s: "
9189 			    "ifp=%p couldn't get a start thread; "
9190 			    "err=%d", __func__, ifp, err);
9191 			/* NOTREACHED */
9192 		}
9193 		bzero(&info, sizeof(info));
9194 		info.importance = 1;
9195 		kret = thread_policy_set(ifp->if_start_thread,
9196 		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
9197 		    THREAD_PRECEDENCE_POLICY_COUNT);
9198 		ASSERT(kret == KERN_SUCCESS);
9199 	} else {
9200 		ifp->if_flowhash = 0;
9201 	}
9202 
9203 	/* Reset polling parameters */
9204 	ifnet_set_poll_cycle(ifp, NULL);
9205 	ifp->if_poll_update = 0;
9206 	ifp->if_poll_flags = 0;
9207 	ifp->if_poll_req = 0;
9208 	VERIFY(ifp->if_poll_thread == THREAD_NULL);
9209 
9210 	/*
9211 	 * If the driver supports the new receive model, create a poller
9212 	 * thread to invoke if_input_poll callback where the packets may
9213 	 * be dequeued from the driver and processed for reception.
9214 	 * if the interface is netif compat then the poller thread is
9215 	 * managed by netif.
9216 	 */
9217 	if (thfunc == dlil_rxpoll_input_thread_func) {
9218 		thread_precedence_policy_data_t info;
9219 		__unused kern_return_t kret;
9220 #if SKYWALK
9221 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9222 #endif /* SKYWALK */
9223 		VERIFY(ifp->if_input_poll != NULL);
9224 		VERIFY(ifp->if_input_ctl != NULL);
9225 		ifnet_incr_pending_thread_count(ifp);
9226 		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
9227 		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
9228 			panic_plain("%s: ifp=%p couldn't get a poll thread; "
9229 			    "err=%d", __func__, ifp, err);
9230 			/* NOTREACHED */
9231 		}
9232 		bzero(&info, sizeof(info));
9233 		info.importance = 1;
9234 		kret = thread_policy_set(ifp->if_poll_thread,
9235 		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
9236 		    THREAD_PRECEDENCE_POLICY_COUNT);
9237 		ASSERT(kret == KERN_SUCCESS);
9238 	}
9239 
9240 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9241 	VERIFY(ifp->if_desc.ifd_len == 0);
9242 	VERIFY(ifp->if_desc.ifd_desc != NULL);
9243 
9244 	/* Record attach PC stacktrace */
9245 	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
9246 
9247 	ifp->if_updatemcasts = 0;
9248 	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
9249 		struct ifmultiaddr *ifma;
9250 		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
9251 			IFMA_LOCK(ifma);
9252 			if (ifma->ifma_addr->sa_family == AF_LINK ||
9253 			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
9254 				ifp->if_updatemcasts++;
9255 			}
9256 			IFMA_UNLOCK(ifma);
9257 		}
9258 
9259 		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
9260 		    "membership(s)\n", if_name(ifp),
9261 		    ifp->if_updatemcasts);
9262 	}
9263 
9264 	/* Clear logging parameters */
9265 	bzero(&ifp->if_log, sizeof(ifp->if_log));
9266 
9267 	/* Clear foreground/realtime activity timestamps */
9268 	ifp->if_fg_sendts = 0;
9269 	ifp->if_rt_sendts = 0;
9270 
9271 	/* Clear throughput estimates and radio type */
9272 	ifp->if_estimated_up_bucket = 0;
9273 	ifp->if_estimated_down_bucket = 0;
9274 	ifp->if_radio_type = 0;
9275 	ifp->if_radio_channel = 0;
9276 
9277 	VERIFY(ifp->if_delegated.ifp == NULL);
9278 	VERIFY(ifp->if_delegated.type == 0);
9279 	VERIFY(ifp->if_delegated.family == 0);
9280 	VERIFY(ifp->if_delegated.subfamily == 0);
9281 	VERIFY(ifp->if_delegated.expensive == 0);
9282 	VERIFY(ifp->if_delegated.constrained == 0);
9283 
9284 	VERIFY(ifp->if_agentids == NULL);
9285 	VERIFY(ifp->if_agentcount == 0);
9286 
9287 	/* Reset interface state */
9288 	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
9289 	ifp->if_interface_state.valid_bitmask |=
9290 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
9291 	ifp->if_interface_state.interface_availability =
9292 	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;
9293 
9294 	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
9295 	if (ifp == lo_ifp) {
9296 		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
9297 		ifp->if_interface_state.valid_bitmask |=
9298 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
9299 	} else {
9300 		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
9301 	}
9302 
9303 	/*
9304 	 * Enable ECN capability on this interface depending on the
9305 	 * value of ECN global setting
9306 	 */
9307 	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
9308 		if_set_eflags(ifp, IFEF_ECN_ENABLE);
9309 		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
9310 	}
9311 
9312 	/*
9313 	 * Built-in Cyclops always on policy for WiFi infra
9314 	 */
9315 	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
9316 		errno_t error;
9317 
9318 		error = if_set_qosmarking_mode(ifp,
9319 		    IFRTYPE_QOSMARKING_FASTLANE);
9320 		if (error != 0) {
9321 			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
9322 			    __func__, ifp->if_xname, error);
9323 		} else {
9324 			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9325 #if (DEVELOPMENT || DEBUG)
9326 			DLIL_PRINTF("%s fastlane enabled on %s\n",
9327 			    __func__, ifp->if_xname);
9328 #endif /* (DEVELOPMENT || DEBUG) */
9329 		}
9330 	}
9331 
9332 	ifnet_lock_done(ifp);
9333 	ifnet_head_done();
9334 
9335 #if SKYWALK
9336 	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
9337 #endif /* SKYWALK */
9338 
9339 	lck_mtx_lock(&ifp->if_cached_route_lock);
9340 	/* Enable forwarding cached route */
9341 	ifp->if_fwd_cacheok = 1;
9342 	/* Clean up any existing cached routes */
9343 	ROUTE_RELEASE(&ifp->if_fwd_route);
9344 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9345 	ROUTE_RELEASE(&ifp->if_src_route);
9346 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9347 	ROUTE_RELEASE(&ifp->if_src_route6);
9348 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9349 	lck_mtx_unlock(&ifp->if_cached_route_lock);
9350 
9351 	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
9352 
9353 	/*
9354 	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
9355 	 * and trees; do this before the ifnet is marked as attached.
9356 	 * The ifnet keeps the reference to the info structures even after
9357 	 * the ifnet is detached, since the network-layer records still
9358 	 * refer to the info structures even after that.  This also
9359 	 * makes it possible for them to still function after the ifnet
9360 	 * is recycled or reattached.
9361 	 */
9362 #if INET
9363 	if (IGMP_IFINFO(ifp) == NULL) {
9364 		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
9365 		VERIFY(IGMP_IFINFO(ifp) != NULL);
9366 	} else {
9367 		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
9368 		igmp_domifreattach(IGMP_IFINFO(ifp));
9369 	}
9370 #endif /* INET */
9371 	if (MLD_IFINFO(ifp) == NULL) {
9372 		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
9373 		VERIFY(MLD_IFINFO(ifp) != NULL);
9374 	} else {
9375 		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
9376 		mld_domifreattach(MLD_IFINFO(ifp));
9377 	}
9378 
9379 	VERIFY(ifp->if_data_threshold == 0);
9380 	VERIFY(ifp->if_dt_tcall != NULL);
9381 
9382 	/*
9383 	 * Wait for the created kernel threads for I/O to get
9384 	 * scheduled and run at least once before we proceed
9385 	 * to mark interface as attached.
9386 	 */
9387 	lck_mtx_lock(&ifp->if_ref_lock);
9388 	while (ifp->if_threads_pending != 0) {
9389 		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
9390 		    "interface %s to get scheduled at least once.\n",
9391 		    __func__, ifp->if_xname);
9392 		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
9393 		    __func__, NULL);
9394 		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
9395 	}
9396 	lck_mtx_unlock(&ifp->if_ref_lock);
9397 	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
9398 	    "at least once. Proceeding.\n", __func__, ifp->if_xname);
9399 
9400 	/* Final mark this ifnet as attached. */
9401 	ifnet_lock_exclusive(ifp);
9402 	lck_mtx_lock_spin(&ifp->if_ref_lock);
9403 	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
9404 	lck_mtx_unlock(&ifp->if_ref_lock);
9405 	if (net_rtref) {
9406 		/* boot-args override; enable idle notification */
9407 		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
9408 		    IFRF_IDLE_NOTIFY);
9409 	} else {
9410 		/* apply previous request(s) to set the idle flags, if any */
9411 		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
9412 		    ifp->if_idle_new_flags_mask);
9413 	}
9414 #if SKYWALK
9415 	/* the interface is fully attached; let the nexus adapter know */
9416 	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
9417 		if (netif_compat) {
9418 			if (sk_netif_compat_txmodel ==
9419 			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
9420 				ifnet_enqueue_multi_setup(ifp,
9421 				    sk_tx_delay_qlen, sk_tx_delay_timeout);
9422 			}
9423 			ifp->if_nx_netif = nexus_netif;
9424 		}
9425 		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
9426 	}
9427 #endif /* SKYWALK */
9428 	ifnet_lock_done(ifp);
9429 	dlil_if_unlock();
9430 
9431 #if PF
9432 	/*
9433 	 * Attach packet filter to this interface, if enabled.
9434 	 */
9435 	pf_ifnet_hook(ifp, 1);
9436 #endif /* PF */
9437 
9438 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);
9439 
9440 	if (dlil_verbose) {
9441 		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
9442 		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
9443 	}
9444 
9445 	return 0;
9446 }
9447 
/*
 * Prepare the storage for the first/permanent link address, which must
 * have the same lifetime as the ifnet itself.  Although the link
 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
 * its location in memory must never change as it may still be referred
 * to by some parts of the system afterwards (unfortunate implementation
 * artifacts inherited from BSD.)
 *
 * Caller must hold ifnet lock as writer.
 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa = NULL;
	struct sockaddr_dl *addr_sdl, *mask_sdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Size the address sockaddr_dl: the interface name (kept in
	 * sdl_data) plus if_addrlen bytes of link-layer address, at
	 * least sizeof(struct sockaddr_dl), rounded up to a 32-bit
	 * multiple.  The mask covers only the name portion.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		struct dl_if_lladdr_xtra_space *__single dl_if_lladdr_ext;

		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			/* First use of extended storage: allocate it */
			dl_if_lladdr_ext = zalloc_permanent(
				sizeof(*dl_if_lladdr_ext), ZALIGN(struct ifaddr));

			ifa = &dl_if_lladdr_ext->ifa;
			ifa_lock_init(ifa);
			ifa_initref(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		} else {
			/* Already on extended storage: reuse it in place */
			dl_if_lladdr_ext = __unsafe_forge_single(
				struct dl_if_lladdr_xtra_space*, ifa);
			ifa = &dl_if_lladdr_ext->ifa;
		}

		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		bzero(dl_if_lladdr_ext->addr_sdl_bytes,
		    sizeof(dl_if_lladdr_ext->addr_sdl_bytes));
		bzero(dl_if_lladdr_ext->mask_sdl_bytes,
		    sizeof(dl_if_lladdr_ext->mask_sdl_bytes));
		addr_sdl = SDL(dl_if_lladdr_ext->addr_sdl_bytes);
		mask_sdl = SDL(dl_if_lladdr_ext->mask_sdl_bytes);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			ifa_initref(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		bzero(dl_if->dl_if_lladdr.addr_sdl_bytes,
		    sizeof(dl_if->dl_if_lladdr.addr_sdl_bytes));
		bzero(dl_if->dl_if_lladdr.mask_sdl_bytes,
		    sizeof(dl_if->dl_if_lladdr.mask_sdl_bytes));
		addr_sdl = SDL(dl_if->dl_if_lladdr.addr_sdl_bytes);
		mask_sdl = SDL(dl_if->dl_if_lladdr.mask_sdl_bytes);
	}

	/* Publish the (possibly different) ifaddr; old one released below */
	if (ifp->if_lladdr != ifa) {
		oifa = ifp->if_lladdr;
		ifp->if_lladdr = ifa;
	}

	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	/* Fill in the address sockaddr_dl: name, index, type, lladdr bytes */
	ifa->ifa_addr = SA(addr_sdl);
	addr_sdl->sdl_len = (u_char)socksize;
	addr_sdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, addr_sdl->sdl_data, min(namelen,
		    sizeof(addr_sdl->sdl_data)));
		addr_sdl->sdl_nlen = (u_char)namelen;
	} else {
		addr_sdl->sdl_nlen = 0;
	}
	addr_sdl->sdl_index = ifp->if_index;
	addr_sdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		addr_sdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(addr_sdl), addr_sdl->sdl_alen);
	} else {
		addr_sdl->sdl_alen = 0;
	}
	/* Netmask: all-ones over the name portion only */
	ifa->ifa_netmask = SA(mask_sdl);
	mask_sdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		mask_sdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	if (oifa != NULL) {
		/* Drop the reference on the previously published ifaddr */
		ifa_remref(oifa);
	}

	return ifa;
}
9579 
/*
 * Drop all network-layer (IPv4 and IPv6) addresses configured on the
 * interface; invoked from the detach path before protocols are unplumbed.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9588 
/*
 * First phase of interface detach: mark the ifnet as detaching, unlink
 * it from ifnet_head and ifindex2ifnet[] so it is no longer visible to
 * lookups, reset per-interface state, and hand the ifnet off to the
 * detacher worker thread, which performs the final teardown
 * (ifnet_detach_final) asynchronously.
 *
 * Returns EINVAL if ifp is NULL or was never attached, ENXIO if a
 * detach is already in progress, 0 on success.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Mark the ND6 CGA state on this interface as uninitialized */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	/* Tear down any network emulation (netem) attached to output */
	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		/* Interface was never (fully) attached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	/*
	 * Increment the generation count on interface deletion
	 */
	ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9783 
9784 static void
ifnet_detaching_enqueue(struct ifnet * ifp)9785 ifnet_detaching_enqueue(struct ifnet *ifp)
9786 {
9787 	dlil_if_lock_assert();
9788 
9789 	++ifnet_detaching_cnt;
9790 	VERIFY(ifnet_detaching_cnt != 0);
9791 	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
9792 	wakeup((caddr_t)&ifnet_delayed_run);
9793 }
9794 
9795 static struct ifnet *
ifnet_detaching_dequeue(void)9796 ifnet_detaching_dequeue(void)
9797 {
9798 	struct ifnet *ifp;
9799 
9800 	dlil_if_lock_assert();
9801 
9802 	ifp = TAILQ_FIRST(&ifnet_detaching_head);
9803 	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9804 	if (ifp != NULL) {
9805 		VERIFY(ifnet_detaching_cnt != 0);
9806 		--ifnet_detaching_cnt;
9807 		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9808 		ifp->if_detaching_link.tqe_next = NULL;
9809 		ifp->if_detaching_link.tqe_prev = NULL;
9810 	}
9811 	return ifp;
9812 }
9813 
/*
 * Continuation routine for the interface detacher thread.  Drains the
 * deferred-detach list, invoking ifnet_detach_final() on each ifnet
 * with the dlil lock dropped, then blocks with itself as continuation
 * until ifnet_detaching_enqueue() posts more work.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		/* Queue drained; go back to waiting */
		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* Drop the dlil lock across the blocking teardown */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* Sleep until the next enqueue; resume in this same routine */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9856 
/*
 * Entry point for the detacher kernel thread.  Arms the initial wait,
 * issues a self-wakeup so the continuation can clear the embryonic
 * state, then blocks into ifnet_detacher_thread_cont, which performs
 * all subsequent work.  Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9873 
9874 static void
ifnet_detach_final(struct ifnet * ifp)9875 ifnet_detach_final(struct ifnet *ifp)
9876 {
9877 	struct ifnet_filter *filter, *filter_next;
9878 	struct dlil_ifnet *dlifp;
9879 	struct ifnet_filter_head fhead;
9880 	struct dlil_threading_info *inp;
9881 	struct ifaddr *ifa;
9882 	ifnet_detached_func if_free;
9883 	int i;
9884 
9885 	/* Let BPF know we're detaching */
9886 	bpfdetach(ifp);
9887 
9888 #if SKYWALK
9889 	dlil_netif_detach_notify(ifp);
9890 	/*
9891 	 * Wait for the datapath to quiesce before tearing down
9892 	 * netif/flowswitch nexuses.
9893 	 */
9894 	dlil_quiesce_and_detach_nexuses(ifp);
9895 #endif /* SKYWALK */
9896 
9897 	lck_mtx_lock(&ifp->if_ref_lock);
9898 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
9899 		panic("%s: flags mismatch (detaching not set) ifp=%p",
9900 		    __func__, ifp);
9901 		/* NOTREACHED */
9902 	}
9903 
9904 	/*
9905 	 * Wait until the existing IO references get released
9906 	 * before we proceed with ifnet_detach.  This is not a
9907 	 * common case, so block without using a continuation.
9908 	 */
9909 	while (ifp->if_refio > 0) {
9910 		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9911 		    "to be released\n", __func__, if_name(ifp));
9912 		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9913 		    (PZERO - 1), "ifnet_ioref_wait", NULL);
9914 	}
9915 
9916 	VERIFY(ifp->if_datamov == 0);
9917 	VERIFY(ifp->if_drainers == 0);
9918 	VERIFY(ifp->if_suspend == 0);
9919 	ifp->if_refflags &= ~IFRF_READY;
9920 	lck_mtx_unlock(&ifp->if_ref_lock);
9921 
9922 	/* Clear agent IDs */
9923 	if (ifp->if_agentids != NULL) {
9924 		kfree_data(ifp->if_agentids,
9925 		    sizeof(uuid_t) * ifp->if_agentcount);
9926 		ifp->if_agentids = NULL;
9927 	}
9928 	ifp->if_agentcount = 0;
9929 
9930 #if SKYWALK
9931 	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9932 #endif /* SKYWALK */
9933 	/* Drain and destroy send queue */
9934 	ifclassq_teardown(ifp->if_snd);
9935 
9936 	/* Detach interface filters */
9937 	lck_mtx_lock(&ifp->if_flt_lock);
9938 	if_flt_monitor_enter(ifp);
9939 
9940 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9941 	fhead = ifp->if_flt_head;
9942 	TAILQ_INIT(&ifp->if_flt_head);
9943 
9944 	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9945 		filter_next = TAILQ_NEXT(filter, filt_next);
9946 		lck_mtx_unlock(&ifp->if_flt_lock);
9947 
9948 		dlil_detach_filter_internal(filter, 1);
9949 		lck_mtx_lock(&ifp->if_flt_lock);
9950 	}
9951 	if_flt_monitor_leave(ifp);
9952 	lck_mtx_unlock(&ifp->if_flt_lock);
9953 
9954 	/* Tell upper layers to drop their network addresses */
9955 	if_purgeaddrs(ifp);
9956 
9957 	ifnet_lock_exclusive(ifp);
9958 
9959 	/* Unplumb all protocols */
9960 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9961 		struct if_proto *proto;
9962 
9963 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9964 		while (proto != NULL) {
9965 			protocol_family_t family = proto->protocol_family;
9966 			ifnet_lock_done(ifp);
9967 			proto_unplumb(family, ifp);
9968 			ifnet_lock_exclusive(ifp);
9969 			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9970 		}
9971 		/* There should not be any protocols left */
9972 		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9973 	}
9974 	kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9975 	ifp->if_proto_hash = NULL;
9976 
9977 	/* Detach (permanent) link address from if_addrhead */
9978 	ifa = TAILQ_FIRST(&ifp->if_addrhead);
9979 	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9980 	IFA_LOCK(ifa);
9981 	if_detach_link_ifa(ifp, ifa);
9982 	IFA_UNLOCK(ifa);
9983 
9984 	/* Remove (permanent) link address from ifnet_addrs[] */
9985 	ifa_remref(ifa);
9986 	ifnet_addrs[ifp->if_index - 1] = NULL;
9987 
9988 	/* This interface should not be on {ifnet_head,detaching} */
9989 	VERIFY(ifp->if_link.tqe_next == NULL);
9990 	VERIFY(ifp->if_link.tqe_prev == NULL);
9991 	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9992 	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9993 	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9994 	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9995 
9996 	/* The slot should have been emptied */
9997 	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9998 
9999 	/* There should not be any addresses left */
10000 	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
10001 
10002 	/*
10003 	 * Signal the starter thread to terminate itself, and wait until
10004 	 * it has exited.
10005 	 */
10006 	if (ifp->if_start_thread != THREAD_NULL) {
10007 		lck_mtx_lock_spin(&ifp->if_start_lock);
10008 		ifp->if_start_flags |= IFSF_TERMINATING;
10009 		wakeup_one((caddr_t)&ifp->if_start_thread);
10010 		lck_mtx_unlock(&ifp->if_start_lock);
10011 
10012 		/* wait for starter thread to terminate */
10013 		lck_mtx_lock(&ifp->if_start_lock);
10014 		while (ifp->if_start_thread != THREAD_NULL) {
10015 			if (dlil_verbose) {
10016 				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
10017 				    __func__,
10018 				    if_name(ifp));
10019 			}
10020 			(void) msleep(&ifp->if_start_thread,
10021 			    &ifp->if_start_lock, (PZERO - 1),
10022 			    "ifnet_start_thread_exit", NULL);
10023 		}
10024 		lck_mtx_unlock(&ifp->if_start_lock);
10025 		if (dlil_verbose) {
10026 			DLIL_PRINTF("%s: %s starter thread termination complete",
10027 			    __func__, if_name(ifp));
10028 		}
10029 	}
10030 
10031 	/*
10032 	 * Signal the poller thread to terminate itself, and wait until
10033 	 * it has exited.
10034 	 */
10035 	if (ifp->if_poll_thread != THREAD_NULL) {
10036 #if SKYWALK
10037 		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
10038 #endif /* SKYWALK */
10039 		lck_mtx_lock_spin(&ifp->if_poll_lock);
10040 		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
10041 		wakeup_one((caddr_t)&ifp->if_poll_thread);
10042 		lck_mtx_unlock(&ifp->if_poll_lock);
10043 
10044 		/* wait for poller thread to terminate */
10045 		lck_mtx_lock(&ifp->if_poll_lock);
10046 		while (ifp->if_poll_thread != THREAD_NULL) {
10047 			if (dlil_verbose) {
10048 				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
10049 				    __func__,
10050 				    if_name(ifp));
10051 			}
10052 			(void) msleep(&ifp->if_poll_thread,
10053 			    &ifp->if_poll_lock, (PZERO - 1),
10054 			    "ifnet_poll_thread_exit", NULL);
10055 		}
10056 		lck_mtx_unlock(&ifp->if_poll_lock);
10057 		if (dlil_verbose) {
10058 			DLIL_PRINTF("%s: %s poller thread termination complete\n",
10059 			    __func__, if_name(ifp));
10060 		}
10061 	}
10062 
10063 	/*
10064 	 * If thread affinity was set for the workloop thread, we will need
10065 	 * to tear down the affinity and release the extra reference count
10066 	 * taken at attach time.  Does not apply to lo0 or other interfaces
10067 	 * without dedicated input threads.
10068 	 */
10069 	if ((inp = ifp->if_inp) != NULL) {
10070 		VERIFY(inp != dlil_main_input_thread);
10071 
10072 		if (inp->dlth_affinity) {
10073 			struct thread *tp, *wtp, *ptp;
10074 
10075 			lck_mtx_lock_spin(&inp->dlth_lock);
10076 			wtp = inp->dlth_driver_thread;
10077 			inp->dlth_driver_thread = THREAD_NULL;
10078 			ptp = inp->dlth_poller_thread;
10079 			inp->dlth_poller_thread = THREAD_NULL;
10080 			ASSERT(inp->dlth_thread != THREAD_NULL);
10081 			tp = inp->dlth_thread;    /* don't nullify now */
10082 			inp->dlth_affinity_tag = 0;
10083 			inp->dlth_affinity = FALSE;
10084 			lck_mtx_unlock(&inp->dlth_lock);
10085 
10086 			/* Tear down poll thread affinity */
10087 			if (ptp != NULL) {
10088 				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
10089 				VERIFY(ifp->if_xflags & IFXF_LEGACY);
10090 				(void) dlil_affinity_set(ptp,
10091 				    THREAD_AFFINITY_TAG_NULL);
10092 				thread_deallocate(ptp);
10093 			}
10094 
10095 			/* Tear down workloop thread affinity */
10096 			if (wtp != NULL) {
10097 				(void) dlil_affinity_set(wtp,
10098 				    THREAD_AFFINITY_TAG_NULL);
10099 				thread_deallocate(wtp);
10100 			}
10101 
10102 			/* Tear down DLIL input thread affinity */
10103 			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
10104 			thread_deallocate(tp);
10105 		}
10106 
10107 		/* disassociate ifp DLIL input thread */
10108 		ifp->if_inp = NULL;
10109 
10110 		/* if the worker thread was created, tell it to terminate */
10111 		if (inp->dlth_thread != THREAD_NULL) {
10112 			lck_mtx_lock_spin(&inp->dlth_lock);
10113 			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
10114 			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
10115 				wakeup_one((caddr_t)&inp->dlth_flags);
10116 			}
10117 			lck_mtx_unlock(&inp->dlth_lock);
10118 			ifnet_lock_done(ifp);
10119 
10120 			/* wait for the input thread to terminate */
10121 			lck_mtx_lock_spin(&inp->dlth_lock);
10122 			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
10123 			    == 0) {
10124 				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
10125 				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
10126 			}
10127 			lck_mtx_unlock(&inp->dlth_lock);
10128 			ifnet_lock_exclusive(ifp);
10129 		}
10130 
10131 		/* clean-up input thread state */
10132 		dlil_clean_threading_info(inp);
10133 		/* clean-up poll parameters */
10134 		VERIFY(ifp->if_poll_thread == THREAD_NULL);
10135 		dlil_reset_rxpoll_params(ifp);
10136 	}
10137 
10138 	/* The driver might unload, so point these to ourselves */
10139 	if_free = ifp->if_free;
10140 	ifp->if_output_dlil = ifp_if_output;
10141 	ifp->if_output = ifp_if_output;
10142 	ifp->if_pre_enqueue = ifp_if_output;
10143 	ifp->if_start = ifp_if_start;
10144 	ifp->if_output_ctl = ifp_if_ctl;
10145 	ifp->if_input_dlil = ifp_if_input;
10146 	ifp->if_input_poll = ifp_if_input_poll;
10147 	ifp->if_input_ctl = ifp_if_ctl;
10148 	ifp->if_ioctl = ifp_if_ioctl;
10149 	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
10150 	ifp->if_free = ifp_if_free;
10151 	ifp->if_demux = ifp_if_demux;
10152 	ifp->if_event = ifp_if_event;
10153 	ifp->if_framer_legacy = ifp_if_framer;
10154 	ifp->if_framer = ifp_if_framer_extended;
10155 	ifp->if_add_proto = ifp_if_add_proto;
10156 	ifp->if_del_proto = ifp_if_del_proto;
10157 	ifp->if_check_multi = ifp_if_check_multi;
10158 
10159 	/* wipe out interface description */
10160 	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
10161 	ifp->if_desc.ifd_len = 0;
10162 	VERIFY(ifp->if_desc.ifd_desc != NULL);
10163 	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
10164 
10165 	/* there shouldn't be any delegation by now */
10166 	VERIFY(ifp->if_delegated.ifp == NULL);
10167 	VERIFY(ifp->if_delegated.type == 0);
10168 	VERIFY(ifp->if_delegated.family == 0);
10169 	VERIFY(ifp->if_delegated.subfamily == 0);
10170 	VERIFY(ifp->if_delegated.expensive == 0);
10171 	VERIFY(ifp->if_delegated.constrained == 0);
10172 
10173 	/* QoS marking get cleared */
10174 	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
10175 	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
10176 
10177 #if SKYWALK
10178 	/* the nexus destructor is responsible for clearing these */
10179 	VERIFY(ifp->if_na_ops == NULL);
10180 	VERIFY(ifp->if_na == NULL);
10181 #endif /* SKYWALK */
10182 
10183 	/* promiscuous/allmulti counts need to start at zero again */
10184 	ifp->if_pcount = 0;
10185 	ifp->if_amcount = 0;
10186 	ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
10187 
10188 	ifnet_lock_done(ifp);
10189 
10190 #if PF
10191 	/*
10192 	 * Detach this interface from packet filter, if enabled.
10193 	 */
10194 	pf_ifnet_hook(ifp, 0);
10195 #endif /* PF */
10196 
10197 	/* Filter list should be empty */
10198 	lck_mtx_lock_spin(&ifp->if_flt_lock);
10199 	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
10200 	VERIFY(ifp->if_flt_busy == 0);
10201 	VERIFY(ifp->if_flt_waiters == 0);
10202 	VERIFY(ifp->if_flt_non_os_count == 0);
10203 	VERIFY(ifp->if_flt_no_tso_count == 0);
10204 	lck_mtx_unlock(&ifp->if_flt_lock);
10205 
10206 	/* Last chance to drain send queue */
10207 	if_qflush_snd(ifp, 0);
10208 
10209 	/* Last chance to cleanup any cached route */
10210 	lck_mtx_lock(&ifp->if_cached_route_lock);
10211 	VERIFY(!ifp->if_fwd_cacheok);
10212 	ROUTE_RELEASE(&ifp->if_fwd_route);
10213 	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
10214 	ROUTE_RELEASE(&ifp->if_src_route);
10215 	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
10216 	ROUTE_RELEASE(&ifp->if_src_route6);
10217 	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
10218 	lck_mtx_unlock(&ifp->if_cached_route_lock);
10219 
10220 	/* Ignore any pending data threshold as the interface is anyways gone */
10221 	ifp->if_data_threshold = 0;
10222 
10223 	VERIFY(ifp->if_dt_tcall != NULL);
10224 	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
10225 
10226 	ifnet_llreach_ifdetach(ifp);
10227 
10228 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
10229 
10230 	/*
10231 	 * Finally, mark this ifnet as detached.
10232 	 */
10233 	if (dlil_verbose) {
10234 		DLIL_PRINTF("%s: detached\n", if_name(ifp));
10235 	}
10236 	lck_mtx_lock_spin(&ifp->if_ref_lock);
10237 	if (!(ifp->if_refflags & IFRF_DETACHING)) {
10238 		panic("%s: flags mismatch (detaching not set) ifp=%p",
10239 		    __func__, ifp);
10240 		/* NOTREACHED */
10241 	}
10242 	ifp->if_refflags &= ~IFRF_DETACHING;
10243 	lck_mtx_unlock(&ifp->if_ref_lock);
10244 	if (if_free != NULL) {
10245 		if_free(ifp);
10246 	}
10247 
10248 	ifclassq_release(&ifp->if_snd);
10249 
10250 	/* we're fully detached, clear the "in use" bit */
10251 	dlifp = (struct dlil_ifnet *)ifp;
10252 	lck_mtx_lock(&dlifp->dl_if_lock);
10253 	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
10254 	dlifp->dl_if_flags &= ~DLIF_INUSE;
10255 	lck_mtx_unlock(&dlifp->dl_if_lock);
10256 
10257 	/* Release reference held during ifnet attach */
10258 	ifnet_release(ifp);
10259 }
10260 
10261 errno_t
ifp_if_output(struct ifnet * ifp,struct mbuf * m)10262 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
10263 {
10264 #pragma unused(ifp)
10265 	m_freem_list(m);
10266 	return 0;
10267 }
10268 
void
ifp_if_start(struct ifnet *ifp)
{
	/*
	 * Start handler installed on a detached ifnet: nothing can be
	 * transmitted anymore, so just purge whatever is still queued.
	 */
	ifnet_purge(ifp);
}
10274 
10275 static errno_t
ifp_if_input(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)10276 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
10277     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
10278     boolean_t poll, struct thread *tp)
10279 {
10280 #pragma unused(ifp, m_tail, s, poll, tp)
10281 	m_freem_list(m_head);
10282 	return ENXIO;
10283 }
10284 
static void
ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
    struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
{
#pragma unused(ifp, flags, max_cnt)
	/*
	 * Poll handler installed on a detached ifnet: always report an
	 * empty harvest.  Every out-parameter is optional, so each store
	 * is guarded against a NULL pointer.
	 */
	if (m_head != NULL) {
		*m_head = NULL;
	}
	if (m_tail != NULL) {
		*m_tail = NULL;
	}
	if (cnt != NULL) {
		*cnt = 0;
	}
	if (len != NULL) {
		*len = 0;
	}
}
10303 
10304 static errno_t
ifp_if_ctl(struct ifnet * ifp,ifnet_ctl_cmd_t cmd,u_int32_t arglen,void * arg)10305 ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
10306 {
10307 #pragma unused(ifp, cmd, arglen, arg)
10308 	return EOPNOTSUPP;
10309 }
10310 
10311 static errno_t
ifp_if_demux(struct ifnet * ifp,struct mbuf * m,char * fh,protocol_family_t * pf)10312 ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
10313 {
10314 #pragma unused(ifp, fh, pf)
10315 	m_freem(m);
10316 	return EJUSTRETURN;
10317 }
10318 
10319 static errno_t
ifp_if_add_proto(struct ifnet * ifp,protocol_family_t pf,const struct ifnet_demux_desc * da,u_int32_t dc)10320 ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
10321     const struct ifnet_demux_desc *da, u_int32_t dc)
10322 {
10323 #pragma unused(ifp, pf, da, dc)
10324 	return EINVAL;
10325 }
10326 
10327 static errno_t
ifp_if_del_proto(struct ifnet * ifp,protocol_family_t pf)10328 ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
10329 {
10330 #pragma unused(ifp, pf)
10331 	return EINVAL;
10332 }
10333 
10334 static errno_t
ifp_if_check_multi(struct ifnet * ifp,const struct sockaddr * sa)10335 ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
10336 {
10337 #pragma unused(ifp, sa)
10338 	return EOPNOTSUPP;
10339 }
10340 
/*
 * Legacy framer installed on a detached ifnet.  The signature differs by
 * platform (embedded keeps the pre/post frame-length out-parameters);
 * both variants simply forward to ifp_if_framer_extended, which frees
 * the packet and returns EJUSTRETURN.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	/* macOS variant has no pre/post lengths to report */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10359 
10360 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10361 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10362     const struct sockaddr *sa, const char *ll, const char *t,
10363     u_int32_t *pre, u_int32_t *post)
10364 {
10365 #pragma unused(ifp, sa, ll, t)
10366 	m_freem(*m);
10367 	*m = NULL;
10368 
10369 	if (pre != NULL) {
10370 		*pre = 0;
10371 	}
10372 	if (post != NULL) {
10373 		*post = 0;
10374 	}
10375 
10376 	return EJUSTRETURN;
10377 }
10378 
10379 errno_t
ifp_if_ioctl(struct ifnet * ifp,unsigned long cmd,void * arg)10380 ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
10381 {
10382 #pragma unused(ifp, cmd, arg)
10383 	return EOPNOTSUPP;
10384 }
10385 
10386 static errno_t
ifp_if_set_bpf_tap(struct ifnet * ifp,bpf_tap_mode tm,bpf_packet_func f)10387 ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
10388 {
10389 #pragma unused(ifp, tm, f)
10390 	/* XXX not sure what to do here */
10391 	return 0;
10392 }
10393 
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
	/* Free callback for a detached ifnet: intentionally a no-op. */
}
10399 
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
	/* Event callback for a detached ifnet: events are discarded. */
}
10405 
/*
 * Find or allocate a dlil_ifnet for <family, uniqueid, ifxname>.
 *
 * Returns 0 with *ifp set (reference held via dlil_if_ref) on success,
 * EBUSY when an in-use interface collides on the extended name or the
 * unique id, ENOMEM when the unique-id copy cannot be allocated.
 * An existing not-in-use entry with a matching unique id is recycled
 * (marked DLIF_INUSE|DLIF_REUSE) in preference to a fresh allocation.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		/* only entries of the requested family can collide */
		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		/* keep a private copy of the caller's unique id */
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	/* initialize the full complement of per-ifnet locks */
	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;
	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* returned object must honor the 64-bit alignment set up above */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10584 
/*
 * Common release path for a dlil_ifnet: drop the allocation statistics,
 * free any oversized broadcast-address storage, restore the name/xname
 * pointers to the embedded storage, and optionally clear DLIF_INUSE
 * (done when the caller is recycling the entry).
 * Takes the ifnet lock exclusively, then the dl_if_lock nested inside it.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* broadcast address was heap-allocated only if it outgrew the inline buffer */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10615 
/*
 * Public release wrapper: releases the dlil_ifnet's per-instance state
 * without clearing DLIF_INUSE (the in-use bit is cleared elsewhere once
 * the detach fully completes).
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10621 
/* Acquire the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10627 
/* Release the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10633 
/* Assert that the current thread owns the dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10639 
/*
 * Detach the protocols that were implicitly plumbed on this interface.
 * Called during interface detach.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10655 
/*
 * Copy the interface's cached IPv4 source route into *dst.
 * The cached-route lock is taken as a spin lock and converted to a
 * full mutex since route_copyout may block.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10666 
/*
 * Store *src as the interface's cached IPv4 source route.  The route
 * reference is consumed: it either moves into the cache, or is released
 * when caching has been disabled (if_fwd_cacheok cleared, e.g. during
 * detach).
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10680 
/*
 * IPv6 counterpart of ifp_src_route_copyout: copy the cached IPv6
 * source route into *dst under the cached-route lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10692 
/*
 * IPv6 counterpart of ifp_src_route_copyin: consume *src, caching it
 * when forwarding-cache use is allowed, releasing it otherwise.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10707 
/*
 * Return a referenced rtentry for src_ip scoped to ifp, using the
 * interface's cached IPv4 source route when it is still usable and
 * matches the address; otherwise perform a fresh scoped lookup and
 * cache the result.  May return NULL when no route is found.
 * The caller owns one reference on the returned rtentry.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	dst = SIN(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	/* cache miss: stale route or a different source address */
	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped(SA(dst),
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10742 
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet: return a referenced
 * rtentry for *src_ip6 scoped to ifp, refreshing the interface's cached
 * IPv6 source route on a miss.  May return NULL when no route exists.
 * The caller owns one reference on the returned rtentry.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	/* cache miss: stale route or a different source address */
	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/*
		 * NOTE(review): guarded here with an if, whereas the inet
		 * variant VERIFYs ro_rt == NULL; equivalent in effect since
		 * ROUTE_RELEASE above clears ro_rt.
		 */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				SA(&src_rt.ro_dst), 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10779 
/*
 * Update the interface's link-quality metric and post a kernel event
 * when the (edge-normalized) value changes.  `locked` indicates whether
 * the caller already holds the ifnet lock exclusively; the lock is
 * always dropped while the kernel event is posted and reacquired for
 * the caller before returning when `locked` was set.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* kick the TCP timer so connections can be aborted promptly */
		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10843 
/*
 * Update the cellular RRC state and post a kernel event on change.
 * Caller MUST hold the ifnet lock exclusively; note that this function
 * temporarily DROPS that lock while posting the event and reacquires it
 * before returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* no-op when the state is already valid and unchanged */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10873 
10874 errno_t
if_state_update(struct ifnet * ifp,struct if_interface_state * if_interface_state)10875 if_state_update(struct ifnet *ifp,
10876     struct if_interface_state *if_interface_state)
10877 {
10878 	u_short if_index_available = 0;
10879 
10880 	ifnet_lock_exclusive(ifp);
10881 
10882 	if ((ifp->if_type != IFT_CELLULAR) &&
10883 	    (if_interface_state->valid_bitmask &
10884 	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10885 		ifnet_lock_done(ifp);
10886 		return ENOTSUP;
10887 	}
10888 	if ((if_interface_state->valid_bitmask &
10889 	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10890 	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10891 	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10892 		ifnet_lock_done(ifp);
10893 		return EINVAL;
10894 	}
10895 	if ((if_interface_state->valid_bitmask &
10896 	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10897 	    if_interface_state->rrc_state !=
10898 	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10899 	    if_interface_state->rrc_state !=
10900 	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10901 		ifnet_lock_done(ifp);
10902 		return EINVAL;
10903 	}
10904 
10905 	if (if_interface_state->valid_bitmask &
10906 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10907 		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10908 	}
10909 	if (if_interface_state->valid_bitmask &
10910 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10911 		if_rrc_state_update(ifp, if_interface_state->rrc_state);
10912 	}
10913 	if (if_interface_state->valid_bitmask &
10914 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10915 		ifp->if_interface_state.valid_bitmask |=
10916 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10917 		ifp->if_interface_state.interface_availability =
10918 		    if_interface_state->interface_availability;
10919 
10920 		if (ifp->if_interface_state.interface_availability ==
10921 		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10922 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10923 			    __func__, if_name(ifp), ifp->if_index);
10924 			if_index_available = ifp->if_index;
10925 		} else {
10926 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
10927 			    __func__, if_name(ifp), ifp->if_index);
10928 		}
10929 	}
10930 	ifnet_lock_done(ifp);
10931 
10932 	/*
10933 	 * Check if the TCP connections going on this interface should be
10934 	 * forced to send probe packets instead of waiting for TCP timers
10935 	 * to fire. This is done on an explicit notification such as
10936 	 * SIOCSIFINTERFACESTATE which marks the interface as available.
10937 	 */
10938 	if (if_index_available > 0) {
10939 		tcp_interface_send_probe(if_index_available);
10940 	}
10941 
10942 	return 0;
10943 }
10944 
10945 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10946 if_get_state(struct ifnet *ifp,
10947     struct if_interface_state *if_interface_state)
10948 {
10949 	ifnet_lock_shared(ifp);
10950 
10951 	if_interface_state->valid_bitmask = 0;
10952 
10953 	if (ifp->if_interface_state.valid_bitmask &
10954 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10955 		if_interface_state->valid_bitmask |=
10956 		    IF_INTERFACE_STATE_RRC_STATE_VALID;
10957 		if_interface_state->rrc_state =
10958 		    ifp->if_interface_state.rrc_state;
10959 	}
10960 	if (ifp->if_interface_state.valid_bitmask &
10961 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10962 		if_interface_state->valid_bitmask |=
10963 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
10964 		if_interface_state->lqm_state =
10965 		    ifp->if_interface_state.lqm_state;
10966 	}
10967 	if (ifp->if_interface_state.valid_bitmask &
10968 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10969 		if_interface_state->valid_bitmask |=
10970 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10971 		if_interface_state->interface_availability =
10972 		    ifp->if_interface_state.interface_availability;
10973 	}
10974 
10975 	ifnet_lock_done(ifp);
10976 }
10977 
10978 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10979 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10980 {
10981 	if (conn_probe > 1) {
10982 		return EINVAL;
10983 	}
10984 	if (conn_probe == 0) {
10985 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10986 	} else {
10987 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10988 	}
10989 
10990 #if NECP
10991 	necp_update_all_clients();
10992 #endif /* NECP */
10993 
10994 	tcp_probe_connectivity(ifp, conn_probe);
10995 	return 0;
10996 }
10997 
10998 /* for uuid.c */
10999 static int
get_ether_index(int * ret_other_index)11000 get_ether_index(int * ret_other_index)
11001 {
11002 	struct ifnet *ifp;
11003 	int en0_index = 0;
11004 	int other_en_index = 0;
11005 	int any_ether_index = 0;
11006 	short best_unit = 0;
11007 
11008 	*ret_other_index = 0;
11009 	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
11010 		/*
11011 		 * find en0, or if not en0, the lowest unit en*, and if not
11012 		 * that, any ethernet
11013 		 */
11014 		ifnet_lock_shared(ifp);
11015 		if (strcmp(ifp->if_name, "en") == 0) {
11016 			if (ifp->if_unit == 0) {
11017 				/* found en0, we're done */
11018 				en0_index = ifp->if_index;
11019 				ifnet_lock_done(ifp);
11020 				break;
11021 			}
11022 			if (other_en_index == 0 || ifp->if_unit < best_unit) {
11023 				other_en_index = ifp->if_index;
11024 				best_unit = ifp->if_unit;
11025 			}
11026 		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
11027 			any_ether_index = ifp->if_index;
11028 		}
11029 		ifnet_lock_done(ifp);
11030 	}
11031 	if (en0_index == 0) {
11032 		if (other_en_index != 0) {
11033 			*ret_other_index = other_en_index;
11034 		} else if (any_ether_index != 0) {
11035 			*ret_other_index = any_ether_index;
11036 		}
11037 	}
11038 	return en0_index;
11039 }
11040 
/*
 * Return an ethernet MAC address for seeding UUID generation.
 * Prefers en0, then the lowest-numbered en* interface, then any
 * ethernet interface (see get_ether_index()).  The en0 index is
 * cached across calls and revalidated against ifindex2ifnet.
 * On success, copies ETHER_ADDR_LEN bytes into 'node' and returns
 * 0; returns -1 if no candidate interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;	/* cached index of en0; 0 = not yet found */
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* rescan if there is no cached index or the interface went away */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
11082 
11083 static int
11084 sysctl_rxpoll SYSCTL_HANDLER_ARGS
11085 {
11086 #pragma unused(arg1, arg2)
11087 	uint32_t i;
11088 	int err;
11089 
11090 	i = if_rxpoll;
11091 
11092 	err = sysctl_handle_int(oidp, &i, 0, req);
11093 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11094 		return err;
11095 	}
11096 
11097 	if (net_rxpoll == 0) {
11098 		return ENXIO;
11099 	}
11100 
11101 	if_rxpoll = i;
11102 	return err;
11103 }
11104 
11105 static int
11106 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
11107 {
11108 #pragma unused(arg1, arg2)
11109 	uint64_t q;
11110 	int err;
11111 
11112 	q = if_rxpoll_mode_holdtime;
11113 
11114 	err = sysctl_handle_quad(oidp, &q, 0, req);
11115 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11116 		return err;
11117 	}
11118 
11119 	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
11120 		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
11121 	}
11122 
11123 	if_rxpoll_mode_holdtime = q;
11124 
11125 	return err;
11126 }
11127 
11128 static int
11129 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
11130 {
11131 #pragma unused(arg1, arg2)
11132 	uint64_t q;
11133 	int err;
11134 
11135 	q = if_rxpoll_sample_holdtime;
11136 
11137 	err = sysctl_handle_quad(oidp, &q, 0, req);
11138 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11139 		return err;
11140 	}
11141 
11142 	if (q < IF_RXPOLL_SAMPLETIME_MIN) {
11143 		q = IF_RXPOLL_SAMPLETIME_MIN;
11144 	}
11145 
11146 	if_rxpoll_sample_holdtime = q;
11147 
11148 	return err;
11149 }
11150 
11151 static int
11152 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
11153 {
11154 #pragma unused(arg1, arg2)
11155 	uint64_t q;
11156 	int err;
11157 
11158 	q = if_rxpoll_interval_time;
11159 
11160 	err = sysctl_handle_quad(oidp, &q, 0, req);
11161 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11162 		return err;
11163 	}
11164 
11165 	if (q < IF_RXPOLL_INTERVALTIME_MIN) {
11166 		q = IF_RXPOLL_INTERVALTIME_MIN;
11167 	}
11168 
11169 	if_rxpoll_interval_time = q;
11170 
11171 	return err;
11172 }
11173 
11174 static int
11175 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
11176 {
11177 #pragma unused(arg1, arg2)
11178 	uint32_t i;
11179 	int err;
11180 
11181 	i = if_sysctl_rxpoll_wlowat;
11182 
11183 	err = sysctl_handle_int(oidp, &i, 0, req);
11184 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11185 		return err;
11186 	}
11187 
11188 	if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
11189 		return EINVAL;
11190 	}
11191 
11192 	if_sysctl_rxpoll_wlowat = i;
11193 	return err;
11194 }
11195 
11196 static int
11197 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
11198 {
11199 #pragma unused(arg1, arg2)
11200 	uint32_t i;
11201 	int err;
11202 
11203 	i = if_sysctl_rxpoll_whiwat;
11204 
11205 	err = sysctl_handle_int(oidp, &i, 0, req);
11206 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11207 		return err;
11208 	}
11209 
11210 	if (i <= if_sysctl_rxpoll_wlowat) {
11211 		return EINVAL;
11212 	}
11213 
11214 	if_sysctl_rxpoll_whiwat = i;
11215 	return err;
11216 }
11217 
11218 static int
11219 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
11220 {
11221 #pragma unused(arg1, arg2)
11222 	int i, err;
11223 
11224 	i = if_sndq_maxlen;
11225 
11226 	err = sysctl_handle_int(oidp, &i, 0, req);
11227 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11228 		return err;
11229 	}
11230 
11231 	if (i < IF_SNDQ_MINLEN) {
11232 		i = IF_SNDQ_MINLEN;
11233 	}
11234 
11235 	if_sndq_maxlen = i;
11236 	return err;
11237 }
11238 
11239 static int
11240 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
11241 {
11242 #pragma unused(arg1, arg2)
11243 	int i, err;
11244 
11245 	i = if_rcvq_maxlen;
11246 
11247 	err = sysctl_handle_int(oidp, &i, 0, req);
11248 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11249 		return err;
11250 	}
11251 
11252 	if (i < IF_RCVQ_MINLEN) {
11253 		i = IF_RCVQ_MINLEN;
11254 	}
11255 
11256 	if_rcvq_maxlen = i;
11257 	return err;
11258 }
11259 
11260 static int
11261 sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS
11262 {
11263 #pragma unused(arg1, arg2)
11264 	int i, err;
11265 
11266 	i = if_rcvq_burst_limit;
11267 
11268 	err = sysctl_handle_int(oidp, &i, 0, req);
11269 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11270 		return err;
11271 	}
11272 
11273 /*
11274  * Safeguard the burst limit to "sane" values on customer builds.
11275  */
11276 #if !(DEVELOPMENT || DEBUG)
11277 	if (i < IF_RCVQ_BURST_LIMIT_MIN) {
11278 		i = IF_RCVQ_BURST_LIMIT_MIN;
11279 	}
11280 
11281 	if (IF_RCVQ_BURST_LIMIT_MAX < i) {
11282 		i = IF_RCVQ_BURST_LIMIT_MAX;
11283 	}
11284 #endif
11285 
11286 	if_rcvq_burst_limit = i;
11287 	return err;
11288 }
11289 
11290 static int
11291 sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS
11292 {
11293 #pragma unused(arg1, arg2)
11294 	int i, err;
11295 
11296 	i = if_rcvq_burst_limit;
11297 
11298 	err = sysctl_handle_int(oidp, &i, 0, req);
11299 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
11300 		return err;
11301 	}
11302 
11303 	if (IF_RCVQ_TRIM_PCT_MAX < i) {
11304 		i = IF_RCVQ_TRIM_PCT_MAX;
11305 	}
11306 
11307 	if (i < IF_RCVQ_TRIM_PCT_MIN) {
11308 		i = IF_RCVQ_TRIM_PCT_MIN;
11309 	}
11310 
11311 	if_rcvq_trim_pct = i;
11312 	return err;
11313 }
11314 
11315 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])11316 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
11317     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
11318 {
11319 	struct kev_dl_node_presence kev;
11320 	struct sockaddr_dl *sdl;
11321 	struct sockaddr_in6 *sin6;
11322 	int ret = 0;
11323 
11324 	VERIFY(ifp);
11325 	VERIFY(sa);
11326 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
11327 
11328 	bzero(&kev, sizeof(kev));
11329 	sin6 = &kev.sin6_node_address;
11330 	sdl = &kev.sdl_node_address;
11331 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
11332 	kev.rssi = rssi;
11333 	kev.link_quality_metric = lqm;
11334 	kev.node_proximity_metric = npm;
11335 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
11336 
11337 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
11338 	if (ret == 0 || ret == EEXIST) {
11339 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
11340 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
11341 		if (err != 0) {
11342 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
11343 			    "error %d\n", __func__, err);
11344 		}
11345 	}
11346 
11347 	if (ret == EEXIST) {
11348 		ret = 0;
11349 	}
11350 	return ret;
11351 }
11352 
/*
 * Report that a node (AF_LINK or AF_INET6 address) has disappeared
 * from 'ifp': removes it from the neighbor cache and, if that
 * succeeds, posts a KEV_DL_NODE_ABSENCE kernel event carrying both
 * address forms.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* stamp the link address with this interface's identity */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
11393 
/*
 * Variant of dlil_node_present() where the caller supplies both the
 * IPv6 address (sa, AF_INET6) and the link-layer address (sdl,
 * AF_LINK) explicitly instead of having them derived.  Records the
 * node in the neighbor cache and posts KEV_DL_NODE_PRESENCE; EEXIST
 * from the neighbor cache is treated as success (event posted with
 * the update flag set).
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	/* copy the link-layer address and stamp it with this interface */
	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	/* an already-present node is not an error to the caller */
	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11437 
11438 const void * __indexable
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11439 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11440     kauth_cred_t *credp)
11441 {
11442 	const u_int8_t *bytes;
11443 	size_t size;
11444 
11445 	bytes = CONST_LLADDR(sdl);
11446 	size = sdl->sdl_alen;
11447 
11448 #if CONFIG_MACF
11449 	if (dlil_lladdr_ckreq) {
11450 		switch (sdl->sdl_type) {
11451 		case IFT_ETHER:
11452 		case IFT_IEEE1394:
11453 			break;
11454 		default:
11455 			credp = NULL;
11456 			break;
11457 		}
11458 		;
11459 
11460 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11461 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11462 				[0] = 2
11463 			};
11464 
11465 			bytes = unspec;
11466 		}
11467 	}
11468 #else
11469 #pragma unused(credp)
11470 #endif
11471 
11472 	if (sizep != NULL) {
11473 		*sizep = size;
11474 	}
11475 	return bytes;
11476 }
11477 
11478 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])11479 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11480     u_int8_t info[DLIL_MODARGLEN])
11481 {
11482 	struct kev_dl_issues kev;
11483 	struct timeval tv;
11484 
11485 	VERIFY(ifp != NULL);
11486 	VERIFY(modid != NULL);
11487 	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11488 	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11489 
11490 	bzero(&kev, sizeof(kev));
11491 
11492 	microtime(&tv);
11493 	kev.timestamp = tv.tv_sec;
11494 	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
11495 	if (info != NULL) {
11496 		bcopy(info, &kev.info, DLIL_MODARGLEN);
11497 	}
11498 
11499 	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
11500 	    &kev.link_data, sizeof(kev), FALSE);
11501 }
11502 
/*
 * Handle SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC.  The set form
 * (superuser only) translates ifo_flags into a throttling level and
 * applies it; the get form reports the current level.  In both
 * cases, on success ifo_inuse is filled with the number of
 * opportunistic TCP+UDP connections currently on the interface.
 * EALREADY from the throttle layer is mapped to success.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* only "block opportunistic" or "off" are accepted */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* a set to the already-current level is not an error */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11561 
11562 int
ifnet_get_throttle(struct ifnet * ifp,u_int32_t * level)11563 ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
11564 {
11565 	struct ifclassq *ifq;
11566 	int err = 0;
11567 
11568 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
11569 		return ENXIO;
11570 	}
11571 
11572 	*level = IFNET_THROTTLE_OFF;
11573 
11574 	ifq = ifp->if_snd;
11575 	IFCQ_LOCK(ifq);
11576 	/* Throttling works only for IFCQ, not ALTQ instances */
11577 	if (IFCQ_IS_ENABLED(ifq)) {
11578 		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
11579 
11580 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11581 		*level = req.level;
11582 	}
11583 	IFCQ_UNLOCK(ifq);
11584 
11585 	return err;
11586 }
11587 
11588 int
ifnet_set_throttle(struct ifnet * ifp,u_int32_t level)11589 ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
11590 {
11591 	struct ifclassq *ifq;
11592 	int err = 0;
11593 
11594 	if (!(ifp->if_eflags & IFEF_TXSTART)) {
11595 		return ENXIO;
11596 	}
11597 
11598 	ifq = ifp->if_snd;
11599 
11600 	switch (level) {
11601 	case IFNET_THROTTLE_OFF:
11602 	case IFNET_THROTTLE_OPPORTUNISTIC:
11603 		break;
11604 	default:
11605 		return EINVAL;
11606 	}
11607 
11608 	IFCQ_LOCK(ifq);
11609 	if (IFCQ_IS_ENABLED(ifq)) {
11610 		cqrq_throttle_t req = { 1, level };
11611 
11612 		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11613 	}
11614 	IFCQ_UNLOCK(ifq);
11615 
11616 	if (err == 0) {
11617 		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
11618 		    level);
11619 #if NECP
11620 		necp_update_all_clients();
11621 #endif /* NECP */
11622 		if (level == IFNET_THROTTLE_OFF) {
11623 			ifnet_start(ifp);
11624 		}
11625 	}
11626 
11627 	return err;
11628 }
11629 
11630 errno_t
ifnet_getset_log(ifnet_t ifp,u_long cmd,struct ifreq * ifr,struct proc * p)11631 ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11632     struct proc *p)
11633 {
11634 #pragma unused(p)
11635 	errno_t result = 0;
11636 	uint32_t flags;
11637 	int level, category, subcategory;
11638 
11639 	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
11640 
11641 	if (cmd == SIOCSIFLOG) {
11642 		if ((result = priv_check_cred(kauth_cred_get(),
11643 		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
11644 			return result;
11645 		}
11646 
11647 		level = ifr->ifr_log.ifl_level;
11648 		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
11649 			result = EINVAL;
11650 		}
11651 
11652 		flags = ifr->ifr_log.ifl_flags;
11653 		if ((flags &= IFNET_LOGF_MASK) == 0) {
11654 			result = EINVAL;
11655 		}
11656 
11657 		category = ifr->ifr_log.ifl_category;
11658 		subcategory = ifr->ifr_log.ifl_subcategory;
11659 
11660 		if (result == 0) {
11661 			result = ifnet_set_log(ifp, level, flags,
11662 			    category, subcategory);
11663 		}
11664 	} else {
11665 		result = ifnet_get_log(ifp, &level, &flags, &category,
11666 		    &subcategory);
11667 		if (result == 0) {
11668 			ifr->ifr_log.ifl_level = level;
11669 			ifr->ifr_log.ifl_flags = flags;
11670 			ifr->ifr_log.ifl_category = category;
11671 			ifr->ifr_log.ifl_subcategory = subcategory;
11672 		}
11673 	}
11674 
11675 	return result;
11676 }
11677 
/*
 * Apply a logging level/facility-flags setting to 'ifp'.  Facility
 * flags accumulate with those already set on the interface; non-DLIL
 * facilities are forwarded to the driver through if_output_ctl when
 * available, otherwise silently ignored.  Setting the level back to
 * IFNET_LOG_DEFAULT clears all facility flags.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* the DLIL facility is handled here, not by the driver */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* level of IFNET_LOG_DEFAULT resets all facility flags */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=0x%x "
		    "arg=0x%x, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags, flags,
		    category, subcategory);
	}

	return err;
}
11735 
11736 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11737 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11738     int32_t *category, int32_t *subcategory)
11739 {
11740 	if (level != NULL) {
11741 		*level = ifp->if_log.level;
11742 	}
11743 	if (flags != NULL) {
11744 		*flags = ifp->if_log.flags;
11745 	}
11746 	if (category != NULL) {
11747 		*category = ifp->if_log.category;
11748 	}
11749 	if (subcategory != NULL) {
11750 		*subcategory = ifp->if_log.subcategory;
11751 	}
11752 
11753 	return 0;
11754 }
11755 
11756 int
ifnet_notify_address(struct ifnet * ifp,int af)11757 ifnet_notify_address(struct ifnet *ifp, int af)
11758 {
11759 	struct ifnet_notify_address_params na;
11760 
11761 #if PF
11762 	(void) pf_ifaddr_hook(ifp);
11763 #endif /* PF */
11764 
11765 	if (ifp->if_output_ctl == NULL) {
11766 		return EOPNOTSUPP;
11767 	}
11768 
11769 	bzero(&na, sizeof(na));
11770 	na.address_family = (sa_family_t)af;
11771 
11772 	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11773 	           sizeof(na), &na);
11774 }
11775 
11776 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11777 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11778 {
11779 	if (ifp == NULL || flowid == NULL) {
11780 		return EINVAL;
11781 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11782 	    !IF_FULLY_ATTACHED(ifp)) {
11783 		return ENXIO;
11784 	}
11785 
11786 	*flowid = ifp->if_flowhash;
11787 
11788 	return 0;
11789 }
11790 
11791 errno_t
ifnet_disable_output(struct ifnet * ifp)11792 ifnet_disable_output(struct ifnet *ifp)
11793 {
11794 	int err;
11795 
11796 	if (ifp == NULL) {
11797 		return EINVAL;
11798 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11799 	    !IF_FULLY_ATTACHED(ifp)) {
11800 		return ENXIO;
11801 	}
11802 
11803 	if ((err = ifnet_fc_add(ifp)) == 0) {
11804 		lck_mtx_lock_spin(&ifp->if_start_lock);
11805 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11806 		lck_mtx_unlock(&ifp->if_start_lock);
11807 	}
11808 	return err;
11809 }
11810 
11811 errno_t
ifnet_enable_output(struct ifnet * ifp)11812 ifnet_enable_output(struct ifnet *ifp)
11813 {
11814 	if (ifp == NULL) {
11815 		return EINVAL;
11816 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11817 	    !IF_FULLY_ATTACHED(ifp)) {
11818 		return ENXIO;
11819 	}
11820 
11821 	ifnet_start_common(ifp, TRUE, FALSE);
11822 	return 0;
11823 }
11824 
/*
 * Flow-advisory callback: a driver signals that the flow identified
 * by 'flowhash' may transmit again.  Look up (and detach) the
 * matching flow-control entry and re-enable output on the interface
 * it refers to.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	/* ifnet_fc_get() removes the entry; we own and must free it */
	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		/* only re-enable if the hash still matches this attach */
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* drop the I/O reference taken by ifnet_is_attached(.., 1) */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11848 
11849 /*
11850  * Function to compare ifnet_fc_entries in ifnet flow control tree
11851  */
11852 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11853 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11854 {
11855 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11856 }
11857 
/*
 * Register 'ifp' in the global flow-control tree, keyed by its flow
 * hash, so that a later flow advisory (ifnet_flowadv) can find the
 * interface and re-enable its output.  Returns 0 if the entry was
 * added or is already present; EAGAIN on a flow-hash collision with
 * a different interface.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex (the Z_WAITOK allocation below may block) */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11901 
/*
 * Look up and REMOVE the flow-control entry for 'flowhash' from the
 * tree.  Returns the detached entry — the caller owns it and must
 * release it with ifnet_fc_entry_free() — or NULL when no entry
 * exists or the owning interface is not attached.
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* detach the entry; ownership passes to the caller */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11939 
/* Return a flow-control entry to its zone allocator */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11945 
11946 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11947 ifnet_calc_flowhash(struct ifnet *ifp)
11948 {
11949 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11950 	uint32_t flowhash = 0;
11951 
11952 	if (ifnet_flowhash_seed == 0) {
11953 		ifnet_flowhash_seed = RandomULong();
11954 	}
11955 
11956 	bzero(&fh, sizeof(fh));
11957 
11958 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11959 	fh.ifk_unit = ifp->if_unit;
11960 	fh.ifk_flags = ifp->if_flags;
11961 	fh.ifk_eflags = ifp->if_eflags;
11962 	fh.ifk_capabilities = ifp->if_capabilities;
11963 	fh.ifk_capenable = ifp->if_capenable;
11964 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11965 	fh.ifk_rand1 = RandomULong();
11966 	fh.ifk_rand2 = RandomULong();
11967 
11968 try_again:
11969 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11970 	if (flowhash == 0) {
11971 		/* try to get a non-zero flowhash */
11972 		ifnet_flowhash_seed = RandomULong();
11973 		goto try_again;
11974 	}
11975 
11976 	return flowhash;
11977 }
11978 
/*
 * Store (or clear, when len == 0) the network signature for the
 * given address family on 'ifp'.  The bytes are copied into the
 * per-family extra data under the exclusive data lock.  Returns 0
 * on success, EINVAL for an unknown family or oversized signature,
 * or ENOMEM when the per-family extra data was never allocated.
 * 'flags' is currently unused.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* break exits the switch, so unlock here */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		/* early-exit paths above have already dropped the lock */
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
12040 
/*
 * Copy out the network signature for 'family' on 'ifp'.  On entry
 * *len is the caller's buffer capacity; on success it is updated to
 * the actual signature length and the bytes are copied into 'data'.
 * Returns EINVAL for bad arguments/family or a too-small buffer,
 * ENOENT when no signature is set, ENOMEM when the per-family extra
 * data was never allocated.  *flags (if non-NULL) is set to 0 on
 * success.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				/* break exits the switch, so unlock here */
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
12101 
12102 int
ifnet_set_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12103 ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12104 {
12105 	int i, error = 0, one_set = 0;
12106 
12107 	if_inet6data_lock_exclusive(ifp);
12108 
12109 	if (IN6_IFEXTRA(ifp) == NULL) {
12110 		error = ENOMEM;
12111 		goto out;
12112 	}
12113 
12114 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12115 		uint32_t prefix_len =
12116 		    prefixes[i].prefix_len;
12117 		struct in6_addr *prefix =
12118 		    &prefixes[i].ipv6_prefix;
12119 
12120 		if (prefix_len == 0) {
12121 			clat_log0((LOG_DEBUG,
12122 			    "NAT64 prefixes purged from Interface %s\n",
12123 			    if_name(ifp)));
12124 			/* Allow clearing the signature */
12125 			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
12126 			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
12127 			    sizeof(struct in6_addr));
12128 
12129 			continue;
12130 		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
12131 		    prefix_len != NAT64_PREFIX_LEN_40 &&
12132 		    prefix_len != NAT64_PREFIX_LEN_48 &&
12133 		    prefix_len != NAT64_PREFIX_LEN_56 &&
12134 		    prefix_len != NAT64_PREFIX_LEN_64 &&
12135 		    prefix_len != NAT64_PREFIX_LEN_96) {
12136 			clat_log0((LOG_DEBUG,
12137 			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
12138 			error = EINVAL;
12139 			goto out;
12140 		}
12141 
12142 		if (IN6_IS_SCOPE_EMBED(prefix)) {
12143 			clat_log0((LOG_DEBUG,
12144 			    "NAT64 prefix has interface/link local scope.\n"));
12145 			error = EINVAL;
12146 			goto out;
12147 		}
12148 
12149 		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
12150 		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
12151 		    sizeof(struct in6_addr));
12152 		clat_log0((LOG_DEBUG,
12153 		    "NAT64 prefix set to %s with prefixlen: %d\n",
12154 		    ip6_sprintf(prefix), prefix_len));
12155 		one_set = 1;
12156 	}
12157 
12158 out:
12159 	if_inet6data_lock_done(ifp);
12160 
12161 	if (error == 0 && one_set != 0) {
12162 		necp_update_all_clients();
12163 	}
12164 
12165 	return error;
12166 }
12167 
12168 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12169 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12170 {
12171 	int i, found_one = 0, error = 0;
12172 
12173 	if (ifp == NULL) {
12174 		return EINVAL;
12175 	}
12176 
12177 	if_inet6data_lock_shared(ifp);
12178 
12179 	if (IN6_IFEXTRA(ifp) == NULL) {
12180 		error = ENOMEM;
12181 		goto out;
12182 	}
12183 
12184 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12185 		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
12186 			found_one = 1;
12187 		}
12188 	}
12189 
12190 	if (found_one == 0) {
12191 		error = ENOENT;
12192 		goto out;
12193 	}
12194 
12195 	if (prefixes) {
12196 		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
12197 		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
12198 	}
12199 
12200 out:
12201 	if_inet6data_lock_done(ifp);
12202 
12203 	return error;
12204 }
12205 
/*
 * Debug hook on the output path: when HWCKSUM_DBG_FINALIZE_FORCED is
 * enabled, force software finalization of delayed checksums so that
 * hardware-offload code paths can be exercised/validated in software.
 * TSO packets are skipped since their checksums are computed per-segment.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	/* Only act when forced-finalize debugging is on, and never for TSO */
	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		/* Count what the software finalizer actually computed */
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		/* Non-IP families are not candidates for finalization */
		return;
	}
}
12247 
/*
 * Debug hook on the input path for partial (16-bit 1's complement)
 * checksum offload.  Depending on hwcksum_dbg_mode it can:
 *  - force partial-offload emulation in software
 *    (HWCKSUM_DBG_PARTIAL_FORCED),
 *  - verify a driver/hardware-supplied partial checksum, and
 *  - emulate hardware that sums from a different start offset
 *    (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ).
 * Statistics counters are updated along the way.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity-check that the frame header lies within the mbuf data */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Link-layer header length: gap between frame header and payload */
	hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		/* Only IP traffic participates in checksum debugging */
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		/* Forced offset must lie within the packet */
		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* Discard whatever RX checksum state the driver provided */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is relative to the frame header */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* Convert to an offset within the payload */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			/* Nothing to adjust, or adjusted offset out of range */
			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* Re-derive the sum as if it started at 'aoff' */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
12372 
12373 static int
12374 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
12375 {
12376 #pragma unused(arg1, arg2)
12377 	u_int32_t i;
12378 	int err;
12379 
12380 	i = hwcksum_dbg_mode;
12381 
12382 	err = sysctl_handle_int(oidp, &i, 0, req);
12383 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12384 		return err;
12385 	}
12386 
12387 	if (hwcksum_dbg == 0) {
12388 		return ENODEV;
12389 	}
12390 
12391 	if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12392 		return EINVAL;
12393 	}
12394 
12395 	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12396 
12397 	return err;
12398 }
12399 
12400 static int
12401 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12402 {
12403 #pragma unused(arg1, arg2)
12404 	u_int32_t i;
12405 	int err;
12406 
12407 	i = hwcksum_dbg_partial_rxoff_forced;
12408 
12409 	err = sysctl_handle_int(oidp, &i, 0, req);
12410 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12411 		return err;
12412 	}
12413 
12414 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12415 		return ENODEV;
12416 	}
12417 
12418 	hwcksum_dbg_partial_rxoff_forced = i;
12419 
12420 	return err;
12421 }
12422 
12423 static int
12424 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12425 {
12426 #pragma unused(arg1, arg2)
12427 	u_int32_t i;
12428 	int err;
12429 
12430 	i = hwcksum_dbg_partial_rxoff_adj;
12431 
12432 	err = sysctl_handle_int(oidp, &i, 0, req);
12433 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
12434 		return err;
12435 	}
12436 
12437 	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12438 		return ENODEV;
12439 	}
12440 
12441 	hwcksum_dbg_partial_rxoff_adj = i;
12442 
12443 	return err;
12444 }
12445 
12446 static int
12447 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12448 {
12449 #pragma unused(oidp, arg1, arg2)
12450 	int err;
12451 
12452 	if (req->oldptr == USER_ADDR_NULL) {
12453 	}
12454 	if (req->newptr != USER_ADDR_NULL) {
12455 		return EPERM;
12456 	}
12457 	err = SYSCTL_OUT(req, &tx_chain_len_stats,
12458 	    sizeof(struct chain_len_stats));
12459 
12460 	return err;
12461 }
12462 
12463 #if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification.  Treated as opaque bytes; the leading
 * 0x1f 0x8b pair happens to be a gzip magic, but the contents are only
 * used as checksum input.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
12500 
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t       init;   /* sumr has been filled in at run time */
	uint16_t        len;    /* number of bytes of sumdata to sum */
	uint16_t        sumr;   /* reference */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12525 
/*
 * Boot-time self-test for the 16-bit 1's complement sum routines
 * (m_sum16, b_sum16) against the reference in_cksum_mbuf_ref(), for
 * every length in sumtbl and every byte alignment within a uint64_t.
 * Panics on any mismatch; prints PASSED otherwise.
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	/* NOTE(review): m_getcl() result not checked for NULL; presumably
	 * M_WAITOK guarantees success here — confirm before reuse. */
	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *);               /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (uintptr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			/* Compute the reference sum once per length */
			if (!sumtbl[n].init) {
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (uintptr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by aligment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
12614 #endif /* DEBUG || DEVELOPMENT */
12615 
12616 #define CASE_STRINGIFY(x) case x: return #x
12617 
12618 __private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)12619 dlil_kev_dl_code_str(u_int32_t event_code)
12620 {
12621 	switch (event_code) {
12622 		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
12623 		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
12624 		CASE_STRINGIFY(KEV_DL_SIFMTU);
12625 		CASE_STRINGIFY(KEV_DL_SIFPHYS);
12626 		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
12627 		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
12628 		CASE_STRINGIFY(KEV_DL_ADDMULTI);
12629 		CASE_STRINGIFY(KEV_DL_DELMULTI);
12630 		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
12631 		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
12632 		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
12633 		CASE_STRINGIFY(KEV_DL_LINK_OFF);
12634 		CASE_STRINGIFY(KEV_DL_LINK_ON);
12635 		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
12636 		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
12637 		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
12638 		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
12639 		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
12640 		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
12641 		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
12642 		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
12643 		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
12644 		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
12645 		CASE_STRINGIFY(KEV_DL_ISSUES);
12646 		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
12647 	default:
12648 		break;
12649 	}
12650 	return "";
12651 }
12652 
12653 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12654 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12655 {
12656 #pragma unused(arg1)
12657 	struct ifnet *ifp = arg0;
12658 
12659 	if (ifnet_is_attached(ifp, 1)) {
12660 		nstat_ifnet_threshold_reached(ifp->if_index);
12661 		ifnet_decr_iorefcnt(ifp);
12662 	}
12663 }
12664 
/*
 * Check whether the interface's byte count has advanced past its data
 * threshold since the last notification, and if so schedule the
 * dlil_dt_tcall_fn thread call to inform NetworkStatistics.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	/* The CAS ensures only one thread claims this notification window */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Defer the callback to the next periodic boundary */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No interval configured: notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12694 
12695 #if (DEVELOPMENT || DEBUG)
12696 /*
12697  * The sysctl variable name contains the input parameters of
12698  * ifnet_get_keepalive_offload_frames()
12699  *  ifp (interface index): name[0]
12700  *  frames_array_count:    name[1]
12701  *  frame_data_offset:     name[2]
12702  * The return length gives used_frames_count
12703  */
/*
 * sysctl handler returning an interface's keep-alive offload frames.
 * Root-only.  Input is validated strictly since name[] comes straight
 * from userland; output is one ifnet_keepalive_offload_frame per used
 * slot.
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root can get look at other people TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		/* node is read-only */
		error = EPERM;
		goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/* Make sure the passed buffer is large enough */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	/* NOTE(review): ifp is used below after the head lock is dropped
	 * without taking an I/O reference — presumably acceptable for this
	 * debug-only sysctl; confirm against detach semantics. */
	ifnet_head_done();

	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* Copy out only the slots actually filled in */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
12795 #endif /* DEVELOPMENT || DEBUG */
12796 
/*
 * Forward per-flow interface statistics to the TCP layer's
 * per-flow stats aggregation.
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12803 
/* Atomically OR set_flags into *flags_p; returns the previous value. */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12809 
/* Atomically clear clear_flags in *flags_p. */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12815 
/* Atomically set extended-flag bits; returns the previous if_eflags. */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12821 
/* Atomically clear extended-flag bits on the interface. */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12827 
/* Atomically set x-flag bits; returns the previous if_xflags. */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12833 
/* Atomically clear x-flag bits on the interface. */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12839 
/* Bump the traffic-rule generation id so observers see a change. */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}
12845 
12846 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12847 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12848 {
12849 	if (*genid != ifp->if_traffic_rule_genid) {
12850 		*genid = ifp->if_traffic_rule_genid;
12851 		return TRUE;
12852 	}
12853 	return FALSE;
12854 }
/*
 * Publish a new traffic-rule count (release ordering) and bump the
 * generation id so readers re-sync.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
12861 
12862 static void
log_hexdump(void * data,size_t len)12863 log_hexdump(void *data, size_t len)
12864 {
12865 	size_t i, j, k;
12866 	unsigned char *ptr = (unsigned char *)data;
12867 #define MAX_DUMP_BUF 32
12868 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12869 
12870 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12871 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12872 			unsigned char msnbl = ptr[j] >> 4;
12873 			unsigned char lsnbl = ptr[j] & 0x0f;
12874 
12875 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12876 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12877 
12878 			if ((j % 2) == 1) {
12879 				buf[k++] = ' ';
12880 			}
12881 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12882 				buf[k++] = ' ';
12883 			}
12884 		}
12885 		buf[k] = 0;
12886 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12887 	}
12888 }
12889 
12890 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
12891 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12892 net_check_compatible_if_filter(struct ifnet *ifp)
12893 {
12894 	if (ifp == NULL) {
12895 		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12896 			return false;
12897 		}
12898 	} else {
12899 		if (ifp->if_flt_non_os_count > 0) {
12900 			return false;
12901 		}
12902 	}
12903 	return true;
12904 }
12905 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12906 
12907 #define DUMP_BUF_CHK() {        \
12908 	clen -= k;              \
12909 	if (clen < 1)           \
12910 	        goto done;      \
12911 	c += k;                 \
12912 }
12913 
12914 int dlil_dump_top_if_qlen(char *, int);
/*
 * Debug helper: scan all interfaces and append to 'str' the interface
 * with the longest send-queue (ifcq_len) and the one with the longest
 * DLIL input queue.  Returns the number of characters written.
 *
 * NOTE(review): ifindex2ifnet is walked without ifnet_head_lock and the
 * loop bound is `ifidx < if_index` (other XNU walkers commonly use
 * `<=`), which would skip the highest index — presumably tolerable for
 * a best-effort debug dump; confirm before relying on completeness.
 */
int dlil_dump_top_if_qlen(char *, int);
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;
	struct ifnet *top_ifcq_ifp = NULL;
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;
	uint32_t top_inq_len = 0;

	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		/* dlil_ifnet embeds the ifnet; cast recovers the outer struct */
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		/* Track the deepest transmit classq */
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		/* Track the deepest DLIL input queue */
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		/* DUMP_BUF_CHK advances c / shrinks clen, jumps to done when full */
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12955