xref: /xnu-11215.61.5/bsd/net/dlil.c (revision 4f1223e81cd707a65cc109d0b8ad6653699da3c4)
1 /*
2  * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/dlil_sysctl.h>
54 #include <net/dlil_var_private.h>
55 #include <net/if_arp.h>
56 #include <net/iptap.h>
57 #include <net/pktap.h>
58 #include <net/droptap.h>
59 #include <net/nwk_wq.h>
60 #include <sys/kern_event.h>
61 #include <sys/kdebug.h>
62 #include <sys/mcache.h>
63 #include <sys/syslog.h>
64 #include <sys/protosw.h>
65 #include <sys/priv.h>
66 
67 #include <kern/assert.h>
68 #include <kern/task.h>
69 #include <kern/thread.h>
70 #include <kern/sched_prim.h>
71 #include <kern/locks.h>
72 #include <kern/zalloc.h>
73 
74 #include <net/kpi_protocol.h>
75 #include <net/if_types.h>
76 #include <net/if_ipsec.h>
77 #include <net/if_llreach.h>
78 #include <net/if_utun.h>
79 #include <net/kpi_interfacefilter.h>
80 #include <net/classq/classq.h>
81 #include <net/classq/classq_sfb.h>
82 #include <net/flowhash.h>
83 #include <net/ntstat.h>
84 #if SKYWALK
85 #include <skywalk/lib/net_filter_event.h>
86 #endif /* SKYWALK */
87 #include <net/net_api_stats.h>
88 #include <net/if_ports_used.h>
89 #include <net/if_vlan_var.h>
90 #include <netinet/in.h>
91 #if INET
92 #include <netinet/in_var.h>
93 #include <netinet/igmp_var.h>
94 #include <netinet/ip_var.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_var.h>
97 #include <netinet/udp.h>
98 #include <netinet/udp_var.h>
99 #include <netinet/if_ether.h>
100 #include <netinet/in_pcb.h>
101 #include <netinet/in_tclass.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip_icmp.h>
104 #include <netinet/icmp_var.h>
105 #endif /* INET */
106 
107 #include <net/nat464_utils.h>
108 #include <netinet6/in6_var.h>
109 #include <netinet6/nd6.h>
110 #include <netinet6/mld6_var.h>
111 #include <netinet6/scope6_var.h>
112 #include <netinet/ip6.h>
113 #include <netinet/icmp6.h>
114 #include <net/pf_pbuf.h>
115 #include <libkern/OSAtomic.h>
116 #include <libkern/tree.h>
117 
118 #include <dev/random/randomdev.h>
119 #include <machine/machine_routines.h>
120 
121 #include <mach/thread_act.h>
122 #include <mach/sdt.h>
123 
124 #if CONFIG_MACF
125 #include <sys/kauth.h>
126 #include <security/mac_framework.h>
127 #include <net/ethernet.h>
128 #include <net/firewire.h>
129 #endif
130 
131 #if PF
132 #include <net/pfvar.h>
133 #endif /* PF */
134 #include <net/pktsched/pktsched.h>
135 #include <net/pktsched/pktsched_netem.h>
136 
137 #if NECP
138 #include <net/necp.h>
139 #endif /* NECP */
140 
141 #if SKYWALK
142 #include <skywalk/packet/packet_queue.h>
143 #include <skywalk/nexus/netif/nx_netif.h>
144 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
145 #endif /* SKYWALK */
146 
147 #include <net/sockaddr_utils.h>
148 
149 #include <os/log.h>
150 
151 #define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
152 #define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
153 #define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
154 #define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
155 #define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
156 
157 #define IF_DATA_REQUIRE_ALIGNED_64(f)   \
158 	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
159 
160 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
161 	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
162 
163 enum {
164 	kProtoKPI_v1    = 1,
165 	kProtoKPI_v2    = 2
166 };
167 
168 uint64_t if_creation_generation_count = 0;
169 
170 /*
171  * List of if_proto structures in if_proto_hash[] is protected by
172  * the ifnet lock.  The rest of the fields are initialized at protocol
173  * attach time and never change, thus no lock required as long as
174  * a reference to it is valid, via if_proto_ref().
175  */
/*
 * Per-(interface, protocol-family) attachment record.  Hash linkage is
 * protected by the ifnet lock; the remaining fields are immutable after
 * protocol attach (see comment above).
 */
struct if_proto {
	SLIST_ENTRY(if_proto)       next_hash;      /* if_proto_hash[] chain (ifnet lock) */
	u_int32_t                   refcount;       /* holds taken via if_proto_ref() */
	u_int32_t                   detached;       /* non-zero once detached; see if_proto_free() */
	struct ifnet                *ifp;           /* interface this protocol is attached to */
	protocol_family_t           protocol_family; /* attached protocol family */
	int                         proto_kpi;      /* kProtoKPI_v1 or kProtoKPI_v2 */
	union {
		/* callbacks for version-1 protocol KPI (per-packet input) */
		struct {
			proto_media_input               input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v1;
		/* callbacks for version-2 protocol KPI (packet-chain input) */
		struct {
			proto_media_input_v2            input;
			proto_media_preout              pre_output;
			proto_media_event               event;
			proto_media_ioctl               ioctl;
			proto_media_detached            detached;
			proto_media_resolve_multi       resolve_multi;
			proto_media_send_arp            send_arp;
		} v2;
	} kpi;
};
204 
205 SLIST_HEAD(proto_hash_entry, if_proto);
206 
207 #define DLIL_SDLDATALEN \
208 	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
209 
210 /*
211  * In the common case, the LL address is stored in the
212  * `dl_if_lladdr' member of the `dlil_ifnet'. This is sufficient
213  * for LL addresses that do not exceed the `DLIL_SDLMAXLEN' constant.
214  */
/* Standard (in-line) link-level address storage; see comment above. */
struct dl_if_lladdr_std {
	struct ifaddr   ifa;                            /* common ifaddr header */
	u_int8_t        addr_sdl_bytes[DLIL_SDLMAXLEN]; /* backing store: address sockaddr_dl */
	u_int8_t        mask_sdl_bytes[DLIL_SDLMAXLEN]; /* backing store: netmask sockaddr_dl */
};
220 
221 /*
222  * However, in some rare cases we encounter LL addresses which
223  * would not fit in the `DLIL_SDLMAXLEN' limitation. In such cases
224  * we allocate the storage in the permanent arena, using this memory layout.
225  */
/* Oversized link-level address storage for addresses > DLIL_SDLMAXLEN. */
struct dl_if_lladdr_xtra_space {
	struct ifaddr   ifa;                            /* common ifaddr header */
	u_int8_t        addr_sdl_bytes[SOCK_MAXADDRLEN]; /* backing store: address sockaddr_dl */
	u_int8_t        mask_sdl_bytes[SOCK_MAXADDRLEN]; /* backing store: netmask sockaddr_dl */
};
231 
/*
 * DLIL's container for an ifnet.  The public `struct ifnet' is embedded
 * first so that `struct dlil_ifnet *' and `struct ifnet *' are freely
 * convertible (see DLIL_TO_IFP/IFP_TO_DLIL).
 */
struct dlil_ifnet {
	struct ifnet    dl_if;                  /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
	u_int32_t dl_if_flags;                  /* flags (below) */
	u_int32_t dl_if_refcnt;                 /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void    *dl_if_uniqueid;                /* unique interface id */
	size_t  dl_if_uniqueid_len;             /* length of the unique id */
	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
	struct dl_if_lladdr_std dl_if_lladdr;   /* link-level address storage*/
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;  /* non-zero once the above is valid */
	u_int8_t dl_if_unused;                  /* padding / reserved */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
};
255 
256 /* Values for dl_if_flags (private to DLIL) */
257 #define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
258 #define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
259 #define DLIF_DEBUG      0x4     /* has debugging info */
260 
261 #define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */
262 
263 /* For gdb */
264 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
265 
/* Debug variant of dlil_ifnet: adds ifnet reference-count trace history. */
struct dlil_ifnet_dbg {
	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
276 
277 #define DLIL_TO_IFP(s)  (&s->dl_if)
278 #define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)
279 
/*
 * One attached interface filter.  Callback pointers and identity are
 * set at attach time; presumably list linkage and filt_skip are guarded
 * by the interface filter monitor (if_flt_monitor_*) — confirm against
 * the attach/detach paths.
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter)       filt_next;      /* per-ifnet filter list linkage */
	u_int32_t                       filt_skip;      /* NOTE(review): looks like a "skip me" flag */
	u_int32_t                       filt_flags;     /* filter flags supplied at attach */
	ifnet_t                         filt_ifp;       /* interface being filtered */
	const char                      *filt_name;     /* client-supplied name */
	void                            *filt_cookie;   /* opaque client cookie for callbacks */
	protocol_family_t               filt_protocol;  /* protocol restriction, if any */
	iff_input_func                  filt_input;     /* inbound packet callback */
	iff_output_func                 filt_output;    /* outbound packet callback */
	iff_event_func                  filt_event;     /* interface event callback */
	iff_ioctl_func                  filt_ioctl;     /* ioctl callback */
	iff_detached_func               filt_detached;  /* detach-complete callback */
};
294 
295 /* Mbuf queue used for freeing the excessive mbufs */
296 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
297 
298 struct proto_input_entry;
299 
300 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
301 
302 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
303 
304 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
305 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
306 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
307 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
308 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
309 
310 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
311 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
312     &dlil_lck_attributes);
313 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
314     &dlil_lck_attributes);
315 
316 #if DEBUG
317 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
318 #else
319 static unsigned int ifnet_debug;        /* debugging (disabled) */
320 #endif /* !DEBUG */
321 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
322 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
323 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
324 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
325 
326 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
327 
328 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
329 
330 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
331 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
332 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
333 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
334 
335 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
336 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
337 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
338 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
339 
340 static u_int32_t net_rtref;
341 
342 static struct dlil_main_threading_info dlil_main_input_thread_info;
343 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
344     (struct dlil_threading_info *)&dlil_main_input_thread_info;
345 
346 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
347 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
348 static void dlil_if_trace(struct dlil_ifnet *, int);
349 static void if_proto_ref(struct if_proto *);
350 static void if_proto_free(struct if_proto *);
351 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
352 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
353     u_int32_t list_count);
354 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
355 static void if_flt_monitor_busy(struct ifnet *);
356 static void if_flt_monitor_unbusy(struct ifnet *);
357 static void if_flt_monitor_enter(struct ifnet *);
358 static void if_flt_monitor_leave(struct ifnet *);
359 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
360     char **, protocol_family_t, boolean_t);
361 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
362     protocol_family_t);
363 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
364     const struct sockaddr_dl *);
365 static int ifnet_lookup(struct ifnet *);
366 static void if_purgeaddrs(struct ifnet *);
367 
368 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
369     struct mbuf *, char *);
370 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
371     struct mbuf *);
372 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
373     mbuf_t *, const struct sockaddr *, void *, char *, char *);
374 static void ifproto_media_event(struct ifnet *, protocol_family_t,
375     const struct kev_msg *);
376 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
377     unsigned long, void *);
378 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
379     struct sockaddr_dl *, size_t);
380 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
381     const struct sockaddr_dl *, const struct sockaddr *,
382     const struct sockaddr_dl *, const struct sockaddr *);
383 
384 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
385     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
386     boolean_t poll, struct thread *tp);
387 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
388     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
389 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
390 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
391     protocol_family_t *);
392 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
393     const struct ifnet_demux_desc *, u_int32_t);
394 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
395 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
396 #if !XNU_TARGET_OS_OSX
397 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
398     const struct sockaddr *, const char *, const char *,
399     u_int32_t *, u_int32_t *);
400 #else /* XNU_TARGET_OS_OSX */
401 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
402     const struct sockaddr *, const char *, const char *);
403 #endif /* XNU_TARGET_OS_OSX */
404 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
405     const struct sockaddr *, const char *, const char *,
406     u_int32_t *, u_int32_t *);
407 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
408 static void ifp_if_free(struct ifnet *);
409 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
410 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
411 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
412 
413 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
414     dlil_freeq_t *, struct ifnet_stat_increment_param *);
415 
416 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
417     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
418     boolean_t, struct thread *);
419 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
420     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
421     boolean_t, struct thread *);
422 
423 static void dlil_main_input_thread_func(void *, wait_result_t);
424 static void dlil_main_input_thread_cont(void *, wait_result_t);
425 
426 static void dlil_input_thread_func(void *, wait_result_t);
427 static void dlil_input_thread_cont(void *, wait_result_t);
428 
429 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
430 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
431 
432 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
433     thread_continue_t *);
434 static void dlil_terminate_input_thread(struct dlil_threading_info *);
435 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
436     struct dlil_threading_info *, struct ifnet *, boolean_t);
437 static boolean_t dlil_input_stats_sync(struct ifnet *,
438     struct dlil_threading_info *);
439 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
440     u_int32_t, ifnet_model_t, boolean_t);
441 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
442     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
443 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
444 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
445 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
446 #if DEBUG || DEVELOPMENT
447 static void dlil_verify_sum16(void);
448 #endif /* DEBUG || DEVELOPMENT */
449 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
450     protocol_family_t);
451 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
452     protocol_family_t);
453 
454 static void dlil_incr_pending_thread_count(void);
455 static void dlil_decr_pending_thread_count(void);
456 
457 static void ifnet_detacher_thread_func(void *, wait_result_t);
458 static void ifnet_detacher_thread_cont(void *, wait_result_t);
459 static void ifnet_detach_final(struct ifnet *);
460 static void ifnet_detaching_enqueue(struct ifnet *);
461 static struct ifnet *ifnet_detaching_dequeue(void);
462 
463 static void ifnet_start_thread_func(void *, wait_result_t);
464 static void ifnet_start_thread_cont(void *, wait_result_t);
465 
466 static void ifnet_poll_thread_func(void *, wait_result_t);
467 static void ifnet_poll_thread_cont(void *, wait_result_t);
468 
469 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
470     classq_pkt_t *, boolean_t, boolean_t *);
471 
472 static void ifp_src_route_copyout(struct ifnet *, struct route *);
473 static void ifp_src_route_copyin(struct ifnet *, struct route *);
474 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
475 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
476 
477 static errno_t if_mcasts_update_async(struct ifnet *);
478 
479 /* The following are protected by dlil_ifnet_lock */
480 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
481 static u_int32_t ifnet_detaching_cnt;
482 static boolean_t ifnet_detaching_embryonic;
483 static void *ifnet_delayed_run; /* wait channel for detaching thread */
484 
485 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
486     &dlil_lck_attributes);
487 
488 static uint32_t ifnet_flowhash_seed;
489 
/*
 * Key material folded into ifnet_calc_flowhash() (together with
 * ifnet_flowhash_seed) to derive a per-interface flow hash.
 */
struct ifnet_flowhash_key {
	char            ifk_name[IFNAMSIZ];     /* interface name */
	uint32_t        ifk_unit;               /* interface unit number */
	uint32_t        ifk_flags;              /* if_flags snapshot */
	uint32_t        ifk_eflags;             /* extended flags snapshot */
	uint32_t        ifk_capabilities;       /* capabilities snapshot */
	uint32_t        ifk_capenable;          /* enabled capabilities snapshot */
	uint32_t        ifk_output_sched_model; /* output scheduling model */
	uint32_t        ifk_rand1;              /* random salt */
	uint32_t        ifk_rand2;              /* random salt */
};
501 
502 /* Flow control entry per interface */
503 struct ifnet_fc_entry {
504 	RB_ENTRY(ifnet_fc_entry) ifce_entry;
505 	u_int32_t       ifce_flowhash;
506 	struct ifnet    *ifce_ifp;
507 };
508 
509 static uint32_t ifnet_calc_flowhash(struct ifnet *);
510 static int ifce_cmp(const struct ifnet_fc_entry *,
511     const struct ifnet_fc_entry *);
512 static int ifnet_fc_add(struct ifnet *);
513 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
514 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
515 
516 /* protected by ifnet_fc_lock */
517 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
518 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
519 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
520 
521 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
522 
523 extern void bpfdetach(struct ifnet *);
524 extern void proto_input_run(void);
525 
526 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
527     u_int32_t flags);
528 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
529     u_int32_t flags);
530 
531 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
532 
533 #if CONFIG_MACF
534 #if !XNU_TARGET_OS_OSX
535 int dlil_lladdr_ckreq = 1;
536 #else /* XNU_TARGET_OS_OSX */
537 int dlil_lladdr_ckreq = 0;
538 #endif /* XNU_TARGET_OS_OSX */
539 #endif /* CONFIG_MACF */
540 
541 /* rate limit debug messages */
542 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
543 
/*
 * Atomically bump the ifnet_delay_start_disabled counter (declared
 * elsewhere; presumably in dlil private headers — confirm).
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
549 
550 static void log_hexdump(void *data, size_t len);
551 
552 unsigned int net_rxpoll = 1;
553 unsigned int net_affinity = 1;
554 unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */
555 
556 static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);
557 
558 extern u_int32_t        inject_buckets;
559 
560 /* DLIL data threshold thread call */
561 static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
562 
563 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)564 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
565 {
566 	/*
567 	 * update filter count and route_generation ID to let TCP
568 	 * know it should reevalute doing TSO or not
569 	 */
570 	if (filter_enable) {
571 		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
572 	} else {
573 		VERIFY(ifp->if_flt_no_tso_count != 0);
574 		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
575 	}
576 	routegenid_update();
577 }
578 
579 #if SKYWALK
580 
581 static bool net_check_compatible_if_filter(struct ifnet *ifp);
582 
583 /* if_attach_nx flags defined in os_skywalk_private.h */
584 unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
585 unsigned int if_enable_fsw_ip_netagent =
586     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
587 unsigned int if_enable_fsw_transport_netagent =
588     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
589 
590 unsigned int if_netif_all =
591     ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);
592 
593 /* Configure flowswitch to use max mtu sized buffer */
594 static bool fsw_use_max_mtu_buffer = false;
595 
596 
597 static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);
598 
599 #include <skywalk/os_skywalk_private.h>
600 
601 boolean_t
ifnet_nx_noauto(ifnet_t ifp)602 ifnet_nx_noauto(ifnet_t ifp)
603 {
604 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
605 }
606 
/*
 * TRUE when a flowswitch must not be auto-attached to this interface;
 * currently this is exactly the low-latency interfaces.
 */
boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}
612 
613 boolean_t
ifnet_is_low_latency(ifnet_t ifp)614 ifnet_is_low_latency(ifnet_t ifp)
615 {
616 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
617 }
618 
/*
 * Decide whether this interface should get a netif compat nexus.
 * Globally gated by IF_ATTACH_NX_NETIF_COMPAT; on non-macOS targets the
 * compat layer is plumbed selectively to conserve memory.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		/* compat auto-attach globally disabled */
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point: the interface literally named "ap" */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
643 
644 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)645 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
646 {
647 	if (if_is_fsw_transport_netagent_enabled()) {
648 		/* check if netagent has been manually enabled for ipsec/utun */
649 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
650 			return ipsec_interface_needs_netagent(ifp);
651 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
652 			return utun_interface_needs_netagent(ifp);
653 		}
654 
655 		/* check ifnet no auto nexus override */
656 		if (ifnet_nx_noauto(ifp)) {
657 			return FALSE;
658 		}
659 
660 		/* check global if_attach_nx configuration */
661 		switch (ifp->if_family) {
662 		case IFNET_FAMILY_CELLULAR:
663 		case IFNET_FAMILY_ETHERNET:
664 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
665 				return TRUE;
666 			}
667 			break;
668 		default:
669 			break;
670 		}
671 	}
672 	return FALSE;
673 }
674 
675 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)676 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
677 {
678 #pragma unused(ifp)
679 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
680 		return TRUE;
681 	}
682 	return FALSE;
683 }
684 
685 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)686 ifnet_needs_netif_netagent(ifnet_t ifp)
687 {
688 #pragma unused(ifp)
689 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
690 }
691 
/*
 * Detach and free a single nexus provider instance.
 *
 * Returns FALSE when there is no instance to tear down; otherwise
 * detaches the device port (if supplied), frees the provider instance,
 * and returns TRUE.  Errors from the underlying calls are logged
 * (tagged with `func_str') but deliberately not propagated.
 */
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t         err;

	if (instance == NULL || uuid_is_null(instance)) {
		/* nothing was ever attached */
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}
718 
/*
 * Tear down a nexus: first the provider instance (and its device port),
 * then deregister the provider itself.  Returns TRUE if anything was
 * actually torn down.  Underlying errors are logged but not returned.
 */
static boolean_t
dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
    uuid_t device)
{
	boolean_t               detached = FALSE;
	nexus_controller_t      controller = kern_nexus_shared_controller();
	int                     err;

	if (dlil_detach_nexus_instance(controller, func_str, instance,
	    device)) {
		detached = TRUE;
	}
	if (provider != NULL && !uuid_is_null(provider)) {
		/* counted as detached even if deregistration fails below */
		detached = TRUE;
		err = kern_nexus_controller_deregister_provider(controller,
		    provider);
		if (err != 0) {
			DLIL_PRINTF("%s deregister_provider %d\n",
			    func_str, err);
		}
	}
	return detached;
}
742 
/*
 * Register a nexus provider of the given type (netif or flowswitch) for
 * `ifp' and allocate one provider instance.  On success, *provider and
 * *instance receive the new UUIDs and 0 is returned; on failure any
 * partially registered provider is deregistered and an errno is
 * returned.  `attr' supplies provider attributes (owned by the caller).
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t          dom_prov;
	errno_t         err;
	nexus_name_t    provider_name;
	const char      *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	/* e.g. "com.apple.netif.en0" */
	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* undo the registration above; its return value is ignored */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
	/* note: the success path also falls through here with err == 0 */
failed:
	return err;
}
792 
793 static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp,if_nexus_netif_t netif_nx)794 dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
795 {
796 	nexus_attr_t            attr = NULL;
797 	nexus_controller_t      controller;
798 	errno_t                 err;
799 
800 	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
801 		/* it's already attached */
802 		if (dlil_verbose) {
803 			DLIL_PRINTF("%s: %s already has nexus attached\n",
804 			    __func__, if_name(ifp));
805 			/* already attached */
806 		}
807 		goto failed;
808 	}
809 
810 	err = kern_nexus_attr_create(&attr);
811 	if (err != 0) {
812 		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
813 		    if_name(ifp));
814 		goto failed;
815 	}
816 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
817 	VERIFY(err == 0);
818 
819 	controller = kern_nexus_shared_controller();
820 
821 	/* create the netif provider and instance */
822 	err = dlil_create_provider_and_instance(controller,
823 	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
824 	    &netif_nx->if_nif_instance, attr);
825 	if (err != 0) {
826 		goto failed;
827 	}
828 	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
829 	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
830 	if (err != 0) {
831 		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
832 		    __func__, err);
833 		/* cleanup provider and instance */
834 		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
835 		    netif_nx->if_nif_instance, NULL);
836 		goto failed;
837 	}
838 	return TRUE;
839 
840 failed:
841 	if (attr != NULL) {
842 		kern_nexus_attr_destroy(attr);
843 	}
844 	return FALSE;
845 }
846 
847 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)848 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
849 {
850 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
851 	    IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
852 		goto failed;
853 	}
854 	switch (ifp->if_type) {
855 	case IFT_CELLULAR:
856 	case IFT_ETHER:
857 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
858 			/* don't auto-attach */
859 			goto failed;
860 		}
861 		break;
862 	default:
863 		/* don't auto-attach */
864 		goto failed;
865 	}
866 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
867 
868 failed:
869 	return FALSE;
870 }
871 
872 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)873 dlil_is_native_netif_nexus(ifnet_t ifp)
874 {
875 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
876 }
877 
/*
 * Tear down a previously attached netif nexus: detaches the nexus
 * attachment, frees the instance, and deregisters the provider via
 * dlil_detach_nexus().
 */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
885 
886 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)887 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
888 {
889 	struct ifreq        ifr;
890 	int                 error;
891 
892 	bzero(&ifr, sizeof(ifr));
893 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
894 	if (error == 0) {
895 		*ifdm_p = ifr.ifr_devmtu;
896 	}
897 	return error;
898 }
899 
900 static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp,uint32_t * large_buf_size)901 _dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
902 {
903 	uint32_t tso_v4_mtu = 0;
904 	uint32_t tso_v6_mtu = 0;
905 
906 	if (!kernel_is_macos_or_server()) {
907 		return;
908 	}
909 
910 	if (!dlil_is_native_netif_nexus(ifp)) {
911 		return;
912 	}
913 	/*
914 	 * Note that we are reading the real hwassist flags set by the driver
915 	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
916 	 * hasn't been called yet.
917 	 */
918 	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
919 		tso_v4_mtu = ifp->if_tso_v4_mtu;
920 	}
921 	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
922 		tso_v6_mtu = ifp->if_tso_v6_mtu;
923 	}
924 	/*
925 	 * If the hardware supports TSO, adjust the large buf size to match the
926 	 * supported TSO MTU size.
927 	 */
928 	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
929 		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
930 	} else {
931 		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
932 	}
933 	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
934 }
935 
936 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)937 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
938     bool *use_multi_buflet, uint32_t *large_buf_size)
939 {
940 	struct kern_pbufpool_memory_info rx_pp_info;
941 	struct kern_pbufpool_memory_info tx_pp_info;
942 	uint32_t if_max_mtu = 0;
943 	uint32_t drv_buf_size;
944 	struct ifdevmtu ifdm;
945 	int err;
946 
947 	/*
948 	 * To perform intra-stack RX aggregation flowswitch needs to use
949 	 * multi-buflet packet.
950 	 */
951 	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
952 
953 	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
954 	/*
955 	 * IP over Thunderbolt interface can deliver the largest IP packet,
956 	 * but the driver advertises the MAX MTU as only 9K.
957 	 */
958 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
959 		if_max_mtu = IP_MAXPACKET;
960 		goto skip_mtu_ioctl;
961 	}
962 
963 	/* determine max mtu */
964 	bzero(&ifdm, sizeof(ifdm));
965 	err = dlil_siocgifdevmtu(ifp, &ifdm);
966 	if (__improbable(err != 0)) {
967 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
968 		    __func__, if_name(ifp));
969 		/* use default flowswitch buffer size */
970 		if_max_mtu = NX_FSW_BUFSIZE;
971 	} else {
972 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
973 		    ifdm.ifdm_max, ifdm.ifdm_current);
974 		/* rdar://problem/44589731 */
975 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
976 	}
977 
978 skip_mtu_ioctl:
979 	if (if_max_mtu == 0) {
980 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
981 		    __func__, if_name(ifp));
982 		return EINVAL;
983 	}
984 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
985 		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
986 		    "max bufsize(%d)\n", __func__,
987 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
988 		return EINVAL;
989 	}
990 
991 	/*
992 	 * for skywalk native driver, consult the driver packet pool also.
993 	 */
994 	if (dlil_is_native_netif_nexus(ifp)) {
995 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
996 		    &tx_pp_info);
997 		if (err != 0) {
998 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
999 			    __func__, if_name(ifp));
1000 			return ENXIO;
1001 		}
1002 		drv_buf_size = tx_pp_info.kpm_bufsize *
1003 		    tx_pp_info.kpm_max_frags;
1004 		if (if_max_mtu > drv_buf_size) {
1005 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1006 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1007 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1008 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1009 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1010 			return EINVAL;
1011 		}
1012 	} else {
1013 		drv_buf_size = if_max_mtu;
1014 	}
1015 
1016 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1017 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1018 		*use_multi_buflet = true;
1019 		/* default flowswitch buffer size */
1020 		*buf_size = NX_FSW_BUFSIZE;
1021 		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1022 	} else {
1023 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1024 	}
1025 	_dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1026 	ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1027 	if (*buf_size >= *large_buf_size) {
1028 		*large_buf_size = 0;
1029 	}
1030 	return 0;
1031 }
1032 
1033 static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp,if_nexus_flowswitch_t nexus_fsw)1034 _dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
1035 {
1036 	nexus_attr_t            attr = NULL;
1037 	nexus_controller_t      controller;
1038 	errno_t                 err = 0;
1039 	uuid_t                  netif;
1040 	uint32_t                buf_size = 0;
1041 	uint32_t                large_buf_size = 0;
1042 	bool                    multi_buflet;
1043 
1044 	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
1045 	    IFNET_IS_VMNET(ifp)) {
1046 		goto failed;
1047 	}
1048 
1049 	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
1050 		/* not possible to attach (netif native/compat not plumbed) */
1051 		goto failed;
1052 	}
1053 
1054 	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
1055 		/* don't auto-attach */
1056 		goto failed;
1057 	}
1058 
1059 	/* get the netif instance from the ifp */
1060 	err = kern_nexus_get_netif_instance(ifp, netif);
1061 	if (err != 0) {
1062 		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
1063 		    if_name(ifp));
1064 		goto failed;
1065 	}
1066 
1067 	err = kern_nexus_attr_create(&attr);
1068 	if (err != 0) {
1069 		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
1070 		    if_name(ifp));
1071 		goto failed;
1072 	}
1073 
1074 	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
1075 	    &multi_buflet, &large_buf_size);
1076 	if (err != 0) {
1077 		goto failed;
1078 	}
1079 	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
1080 	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);
1081 
1082 	/* Configure flowswitch buffer size */
1083 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
1084 	VERIFY(err == 0);
1085 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
1086 	    large_buf_size);
1087 	VERIFY(err == 0);
1088 
1089 	/*
1090 	 * Configure flowswitch to use super-packet (multi-buflet).
1091 	 */
1092 	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
1093 	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
1094 	VERIFY(err == 0);
1095 
1096 	/* create the flowswitch provider and instance */
1097 	controller = kern_nexus_shared_controller();
1098 	err = dlil_create_provider_and_instance(controller,
1099 	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
1100 	    &nexus_fsw->if_fsw_instance, attr);
1101 	if (err != 0) {
1102 		goto failed;
1103 	}
1104 
1105 	/* attach the device port */
1106 	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
1107 	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
1108 	if (err != 0) {
1109 		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
1110 		    __func__, err, if_name(ifp));
1111 		/* cleanup provider and instance */
1112 		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
1113 		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
1114 		goto failed;
1115 	}
1116 	return TRUE;
1117 
1118 failed:
1119 	if (err != 0) {
1120 		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
1121 		    __func__, if_name(ifp), err);
1122 	} else {
1123 		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
1124 		    __func__, if_name(ifp));
1125 	}
1126 	if (attr != NULL) {
1127 		kern_nexus_attr_destroy(attr);
1128 	}
1129 	return FALSE;
1130 }
1131 
/*
 * Attach a flowswitch nexus to an interface if one is not already
 * present.  Takes an I/O refcount on the ifnet for the duration of
 * the attach and publishes the new nexus state under the ifnet lock.
 * Returns TRUE only if a flowswitch was attached by this call.
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t               attached = FALSE;
	if_nexus_flowswitch     nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	/* test hook: interfaces marked for direct netif use skip the fsw */
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s\n", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model\n",
		    if_name(ifp));
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	/* hold an I/O refcount; released below before returning */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s not attached",
		    __func__, ifp->if_xname);
		goto done;
	}
	/* only attach when no flowswitch instance is recorded yet */
	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance)) {
		attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
		if (attached) {
			/* publish the new nexus state under the ifnet lock */
			ifnet_lock_exclusive(ifp);
			ifp->if_nx_flowswitch = nexus_fsw;
			ifnet_lock_done(ifp);
		}
	}
	ifnet_decr_iorefcnt(ifp);

done:
	return attached;
}
1173 
/*
 * Tear down a previously attached flowswitch nexus: detaches the device
 * port, frees the instance, and deregisters the provider via
 * dlil_detach_nexus().
 */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1181 
1182 __attribute__((noinline))
1183 static void
dlil_netif_detach_notify(ifnet_t ifp)1184 dlil_netif_detach_notify(ifnet_t ifp)
1185 {
1186 	ifnet_detach_notify_cb_t notify = NULL;
1187 	void *arg = NULL;
1188 
1189 	ifnet_get_detach_notify(ifp, &notify, &arg);
1190 	if (notify == NULL) {
1191 		DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1192 		return;
1193 	}
1194 	(*notify)(arg);
1195 }
1196 
/*
 * Quiesce data movement on the interface, then detach its flowswitch
 * and netif nexuses (in that order: flowswitch sits on top of netif).
 * The asserts verify that either all three UUIDs of a nexus are set or
 * none are, i.e. attach state is all-or-nothing.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block and drain in-flight data movement before tearing down */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	/* re-enable data movement */
	ifnet_datamov_resume(ifp);
}
1226 
1227 boolean_t
ifnet_add_netagent(ifnet_t ifp)1228 ifnet_add_netagent(ifnet_t ifp)
1229 {
1230 	int     error;
1231 
1232 	error = kern_nexus_interface_add_netagent(ifp);
1233 	os_log(OS_LOG_DEFAULT,
1234 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1235 	    ifp->if_xname, error);
1236 	return error == 0;
1237 }
1238 
1239 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1240 ifnet_remove_netagent(ifnet_t ifp)
1241 {
1242 	int     error;
1243 
1244 	error = kern_nexus_interface_remove_netagent(ifp);
1245 	os_log(OS_LOG_DEFAULT,
1246 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1247 	    ifp->if_xname, error);
1248 	return error == 0;
1249 }
1250 
1251 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1252 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1253 {
1254 	if (!IF_FULLY_ATTACHED(ifp)) {
1255 		return FALSE;
1256 	}
1257 	return dlil_attach_flowswitch_nexus(ifp);
1258 }
1259 
/*
 * Detach the interface's flowswitch nexus.  The nexus state is snapshotted
 * and cleared under the ifnet lock first, then the actual teardown runs
 * outside the lock on the local copy.
 */
boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)
{
	if_nexus_flowswitch     nexus_fsw;

	ifnet_lock_exclusive(ifp);
	/* snapshot and clear under the lock; detach on the copy below */
	nexus_fsw = ifp->if_nx_flowswitch;
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
	ifnet_lock_done(ifp);
	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
}
1272 
1273 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1274 ifnet_attach_native_flowswitch(ifnet_t ifp)
1275 {
1276 	if (!dlil_is_native_netif_nexus(ifp)) {
1277 		/* not a native netif */
1278 		return;
1279 	}
1280 	ifnet_attach_flowswitch_nexus(ifp);
1281 }
1282 
/*
 * Install (or clear, with cb == NULL) the flowswitch RX callback.
 * Blocks until all outstanding users of the previous callback have
 * released their reference (if_fsw_rx_cb_ref drains to zero), so the
 * old callback can never run after this returns.  Always returns 0.
 */
int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* wait for in-flight users of the current callback to drain */
	while (ifp->if_fsw_rx_cb_ref > 0) {
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1298 
/*
 * Look up the flowswitch RX callback, taking a reference that must be
 * dropped with ifnet_release_flowswitch_rx_callback().  Returns ENOENT
 * when no callback is installed.
 */
int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * This is for avoiding the unnecessary lock acquire for interfaces
	 * not used by a redirect interface.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* re-check under the lock; the unlocked test above was only a hint */
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	/* hold a reference; released by ifnet_release_flowswitch_rx_callback() */
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1320 
/*
 * Drop a reference taken by ifnet_get_flowswitch_rx_callback(); wakes
 * any setter waiting in ifnet_set_flowswitch_rx_callback() when the
 * last reference goes away.
 */
void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (--ifp->if_fsw_rx_cb_ref == 0) {
		wakeup(&ifp->if_fsw_rx_cb_ref);
	}
	lck_mtx_unlock(&ifp->if_delegate_lock);
}
1330 
/*
 * Set (or clear, with parent == NULL) the delegate parent of an
 * interface.  Blocks until all holders of the previous parent pointer
 * have released their reference, mirroring the RX-callback handshake
 * above.  Always returns 0.
 */
int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	/* wait for in-flight users of the current parent to drain */
	while (difp->if_delegate_parent_ref > 0) {
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1345 
/*
 * Look up the delegate parent, taking a reference that must be dropped
 * with ifnet_release_delegate_parent().  Returns ENOENT when none is set.
 */
int
ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (difp->if_delegate_parent == NULL) {
		lck_mtx_unlock(&difp->if_delegate_lock);
		return ENOENT;
	}
	*parentp = difp->if_delegate_parent;
	/* hold a reference; released by ifnet_release_delegate_parent() */
	difp->if_delegate_parent_ref++;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1359 
/*
 * Drop a reference taken by ifnet_get_delegate_parent(); wakes any
 * setter waiting in ifnet_set_delegate_parent() when the last
 * reference goes away.
 */
void
ifnet_release_delegate_parent(ifnet_t difp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (--difp->if_delegate_parent_ref == 0) {
		wakeup(&difp->if_delegate_parent_ref);
	}
	lck_mtx_unlock(&difp->if_delegate_lock);
}
1369 
/*
 * Set the detach-notify callback/argument pair.  Caller must hold the
 * ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}
1378 
/*
 * Read the detach-notify callback/argument pair.  Caller must hold the
 * ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	*notifyp = ifp->if_detach_notify;
	*argp = ifp->if_detach_notify_arg;
}
1387 
/* Locking wrapper around ifnet_set_detach_notify_locked(). */
__attribute__((noinline))
void
ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_exclusive(ifp);
	ifnet_set_detach_notify_locked(ifp, notify, arg);
	ifnet_lock_done(ifp);
}
1396 
/* Locking wrapper around ifnet_get_detach_notify_locked(). */
__attribute__((noinline))
void
ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_exclusive(ifp);
	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
	ifnet_lock_done(ifp);
}
1405 #endif /* SKYWALK */
1406 
/*
 * Sanity-check an inbound mbuf: it must carry a pkthdr whose receive
 * interface matches 'ifp' (loopback excepted).  Panics on violation.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	/* NOTREACHED */                                        \
	}                                                               \
}
1415 
/*
 * Exponentially-weighted moving average with a power-of-two decay:
 * old = old + (new - old) / 2^decay, computed in integer arithmetic.
 * A zero 'old' is treated as uninitialized and seeded with 'new'.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
1424 
#define MBPS    (1ULL * 1000 * 1000)
#define GBPS    (MBPS * 1000)

/* RX-poll watermarks selected by downlink speed (see table below). */
struct rxpoll_time_tbl {
	u_int64_t       speed;          /* downlink speed */
	u_int32_t       plowat;         /* packets low watermark */
	u_int32_t       phiwat;         /* packets high watermark */
	u_int32_t       blowat;         /* bytes low watermark */
	u_int32_t       bhiwat;         /* bytes high watermark */
};

/* Table is scanned by link speed; the zero-speed entry terminates it. */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
1444 
1445 static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
1446     &dlil_lck_attributes);
1447 static uint32_t dlil_pending_thread_cnt = 0;
1448 
/* Bump the count of DLIL worker threads still starting up. */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1457 
/*
 * Drop the pending-thread count; wakes anyone sleeping on
 * dlil_pending_thread_cnt once the last pending thread is accounted for.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1470 
1471 int
proto_hash_value(u_int32_t protocol_family)1472 proto_hash_value(u_int32_t protocol_family)
1473 {
1474 	/*
1475 	 * dlil_proto_unplumb_all() depends on the mapping between
1476 	 * the hash bucket index and the protocol family defined
1477 	 * here; future changes must be applied there as well.
1478 	 */
1479 	switch (protocol_family) {
1480 	case PF_INET:
1481 		return 0;
1482 	case PF_INET6:
1483 		return 1;
1484 	case PF_VLAN:
1485 		return 2;
1486 	case PF_UNSPEC:
1487 	default:
1488 		return 3;
1489 	}
1490 }
1491 
1492 /*
1493  * Caller must already be holding ifnet lock.
1494  */
static struct if_proto *
find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
{
	struct if_proto *proto = NULL;
	u_int32_t i = proto_hash_value(protocol_family);

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);

	/* hash table may not exist yet during early attach */
	if (ifp->if_proto_hash != NULL) {
		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
	}

	/* walk the bucket chain for an exact family match */
	while (proto != NULL && proto->protocol_family != protocol_family) {
		proto = SLIST_NEXT(proto, next_hash);
	}

	/* returned entry is referenced; caller drops via if_proto_free() */
	if (proto != NULL) {
		if_proto_ref(proto);
	}

	return proto;
}
1517 
/* Take a reference on an attached protocol entry. */
static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}
1523 
1524 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1525 
/*
 * Drop a reference on an attached protocol entry.  When the last
 * reference goes away: invoke the protocol's detached callback, purge
 * its routes, post KEV_DL_PROTO_DETACHED, mark the interface down if it
 * was the last protocol, and free the entry.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		/* references remain; nothing more to do */
		return;
	}

	/* tell the protocol (v1 or v2 KPI) that it has been detached */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1587 
1588 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1589 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1590 {
1591 #if !MACH_ASSERT
1592 #pragma unused(ifp)
1593 #endif
1594 	unsigned int type = 0;
1595 	int ass = 1;
1596 
1597 	switch (what) {
1598 	case IFNET_LCK_ASSERT_EXCLUSIVE:
1599 		type = LCK_RW_ASSERT_EXCLUSIVE;
1600 		break;
1601 
1602 	case IFNET_LCK_ASSERT_SHARED:
1603 		type = LCK_RW_ASSERT_SHARED;
1604 		break;
1605 
1606 	case IFNET_LCK_ASSERT_OWNED:
1607 		type = LCK_RW_ASSERT_HELD;
1608 		break;
1609 
1610 	case IFNET_LCK_ASSERT_NOTOWNED:
1611 		/* nothing to do here for RW lock; bypass assert */
1612 		ass = 0;
1613 		break;
1614 
1615 	default:
1616 		panic("bad ifnet assert type: %d", what);
1617 		/* NOTREACHED */
1618 	}
1619 	if (ass) {
1620 		LCK_RW_ASSERT(&ifp->if_lock, type);
1621 	}
1622 }
1623 
/* Take the per-ifnet RW lock in shared (read) mode. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}
1629 
/* Take the per-ifnet RW lock in exclusive (write) mode. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}
1635 
/* Release the per-ifnet RW lock (either mode). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1641 
1642 #if INET
/* Take the per-ifnet IPv4 data RW lock in shared mode. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}
1648 
/* Take the per-ifnet IPv4 data RW lock in exclusive mode. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}
1654 
/* Release the per-ifnet IPv4 data RW lock (either mode). */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
1660 #endif
1661 
/* Take the per-ifnet IPv6 data RW lock in shared mode. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}
1667 
/* Take the per-ifnet IPv6 data RW lock in exclusive mode. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}
1673 
/* Release the per-ifnet IPv6 data RW lock (either mode). */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1679 
/* Take the global interface-list RW lock in shared mode. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}
1685 
/* Take the global interface-list RW lock in exclusive mode. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}
1691 
/* Release the global interface-list RW lock (either mode). */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}
1697 
/* Assert the global interface-list RW lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1703 
1704 /*
1705  * dlil_ifp_protolist
1706  * - get the list of protocols attached to the interface, or just the number
1707  *   of attached protocols
1708  * - if the number returned is greater than 'list_count', truncation occurred
1709  *
1710  * Note:
1711  * - caller must already be holding ifnet lock.
1712  */
1713 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1714 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1715     u_int32_t list_count)
1716 {
1717 	u_int32_t       count = 0;
1718 	int             i;
1719 
1720 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1721 
1722 	if (ifp->if_proto_hash == NULL) {
1723 		goto done;
1724 	}
1725 
1726 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1727 		struct if_proto *proto;
1728 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1729 			if (list != NULL && count < list_count) {
1730 				list[count] = proto->protocol_family;
1731 			}
1732 			count++;
1733 		}
1734 	}
1735 done:
1736 	return count;
1737 }
1738 
/*
 * Locked wrapper around dlil_ifp_protolist(); see that function for
 * the list/count semantics.
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
1747 
/* Free a protocol list previously handed out to an if_get_protolist() caller. */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1753 
/*
 * Post a KEV_NETWORK_CLASS kernel event for an interface.  When
 * event_data is NULL a minimal net_event_data payload is synthesized;
 * either way the interface name/family/unit are stamped into it.
 * 'suppress_generation' stops the interface generation count from being
 * bumped (avoids triggering NECP client updates for chatty events).
 * Returns the result of dlil_event_internal().
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
	ev_msg.kev_class        = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass     = event_subclass;
	ev_msg.event_code       = event_code;

	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	/* stamp the originating interface into the payload */
	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit   = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr    = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes  */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
1815 
/*
 * Allocate the per-interface protocol statistics structures hanging
 * off the ifnet: zone-backed, 64-bit-aligned tcpstat_local and
 * udpstat_local buffers, plus heap-allocated ECN stat blocks for v4
 * and v6.  Returns 0 on success, EINVAL if ifp is NULL or if the
 * TCP/UDP stat buffers were already present.
 *
 * NOTE(review): when if_tcp_stat/if_udp_stat are already set on
 * entry, ret remains EINVAL and the cleanup path below frees those
 * existing buffers -- presumably this function is only invoked once
 * per ifnet; confirm with callers before relying on re-invocation.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* on failure, undo any allocations recorded on the ifnet */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original (unaligned) zone address */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
1901 
1902 static void
dlil_reset_rxpoll_params(ifnet_t ifp)1903 dlil_reset_rxpoll_params(ifnet_t ifp)
1904 {
1905 	ASSERT(ifp != NULL);
1906 	ifnet_set_poll_cycle(ifp, NULL);
1907 	ifp->if_poll_update = 0;
1908 	ifp->if_poll_flags = 0;
1909 	ifp->if_poll_req = 0;
1910 	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
1911 	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
1912 	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
1913 	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
1914 	net_timerclear(&ifp->if_poll_mode_holdtime);
1915 	net_timerclear(&ifp->if_poll_mode_lasttime);
1916 	net_timerclear(&ifp->if_poll_sample_holdtime);
1917 	net_timerclear(&ifp->if_poll_sample_lasttime);
1918 	net_timerclear(&ifp->if_poll_dbg_lasttime);
1919 }
1920 
/*
 * Create and start the input thread for an interface.  A NULL ifp
 * denotes the main DLIL input thread, created once at dlil_init time.
 * The input strategy defaults to asynchronous (dedicated worker
 * thread); netif-backed devices that are not capable of legacy hybrid
 * polling use the synchronous strategy and get no dedicated thread
 * (ENODEV is returned in that case, after the lock/queue state has
 * still been initialized).  On return, *thfunc (if non-NULL) holds
 * the thread continuation that was selected, or NULL for synchronous.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* legacy (non-netif) hybrid polling requires all three conditions */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	/* per-thread lock group/mutex, named after the thread */
	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		/* the main thread also owns the loopback receive queue */
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no dedicated thread to start */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2062 
2063 static void
dlil_clean_threading_info(struct dlil_threading_info * inp)2064 dlil_clean_threading_info(struct dlil_threading_info *inp)
2065 {
2066 	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
2067 	lck_grp_free(inp->dlth_lock_grp);
2068 	inp->dlth_lock_grp = NULL;
2069 
2070 	inp->dlth_flags = 0;
2071 	inp->dlth_wtot = 0;
2072 	bzero(inp->dlth_name, sizeof(inp->dlth_name));
2073 	inp->dlth_ifp = NULL;
2074 	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
2075 	qlimit(&inp->dlth_pkts) = 0;
2076 	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));
2077 
2078 	VERIFY(!inp->dlth_affinity);
2079 	inp->dlth_thread = THREAD_NULL;
2080 	inp->dlth_strategy = NULL;
2081 	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
2082 	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
2083 	VERIFY(inp->dlth_affinity_tag == 0);
2084 #if IFNET_INPUT_SANITY_CHK
2085 	inp->dlth_pkts_cnt = 0;
2086 #endif /* IFNET_INPUT_SANITY_CHK */
2087 }
2088 
/*
 * Terminate a per-interface input thread from its own context:
 * drain any queued packets under the thread lock, signal
 * DLIL_INPUT_TERMINATE_COMPLETE to the waiter, free the drained
 * packets, drop the reference taken by kernel_thread_start(), and
 * finally self-terminate.  Never returns.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	/* must run on the thread being terminated; never the main thread */
	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* detach all pending packets so they can be freed outside the lock */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2136 
2137 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2138 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2139 {
2140 	thread_affinity_policy_data_t policy;
2141 
2142 	bzero(&policy, sizeof(policy));
2143 	policy.affinity_tag = tag;
2144 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2145 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2146 }
2147 
2148 #if SKYWALK
2149 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2150 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2151     enum net_filter_event_subsystems state)
2152 {
2153 	evhlog(debug, "%s: eventhandler saw event type=net_filter_event_state event_code=0x%d",
2154 	    __func__, state);
2155 
2156 	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
2157 	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
2158 		if_enable_fsw_transport_netagent = 1;
2159 	} else {
2160 		if_enable_fsw_transport_netagent = 0;
2161 	}
2162 	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
2163 		kern_nexus_update_netagents();
2164 	} else if (!if_enable_fsw_transport_netagent) {
2165 		necp_update_all_clients();
2166 	}
2167 }
2168 #endif /* SKYWALK */
2169 
/*
 * One-time initialization of the DLIL subsystem, called during boot:
 * validates layout/flag invariants, parses boot-args, sizes and
 * creates the dlil_ifnet and per-interface stat zones, initializes
 * the interface lists and dependent subsystems (ifa, pf, classq,
 * pktsched, flowadv, pktap/droptap, qos map, low-power handler,
 * ports-used), then creates the main input thread and the detacher
 * thread, blocking until both have been scheduled at least once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* boot-arg overrides for DLIL tunables */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled.  This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
	if (kernel_is_macos_or_server() && if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize droptap interface */
	droptap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2485 
/*
 * Take a busy reference on the interface filter monitor.
 * Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_busy(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	++ifp->if_flt_busy;
	VERIFY(ifp->if_flt_busy != 0);  /* catch counter wraparound */
}
2494 
/*
 * Drop a busy reference on the filter monitor; alias of
 * if_flt_monitor_leave().  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2500 
2501 static void
if_flt_monitor_enter(struct ifnet * ifp)2502 if_flt_monitor_enter(struct ifnet *ifp)
2503 {
2504 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2505 
2506 	while (ifp->if_flt_busy) {
2507 		++ifp->if_flt_waiters;
2508 		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2509 		    (PZERO - 1), "if_flt_monitor", NULL);
2510 	}
2511 	if_flt_monitor_busy(ifp);
2512 }
2513 
2514 static void
if_flt_monitor_leave(struct ifnet * ifp)2515 if_flt_monitor_leave(struct ifnet *ifp)
2516 {
2517 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2518 
2519 	VERIFY(ifp->if_flt_busy != 0);
2520 	--ifp->if_flt_busy;
2521 
2522 	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2523 		ifp->if_flt_waiters = 0;
2524 		wakeup(&ifp->if_flt_head);
2525 	}
2526 }
2527 
/*
 * Attach an interface filter to ifp and return its reference through
 * *filter_ref.  The interface must be present in the global list and
 * still attached (an io refcnt is held across the insertion).  Filter
 * callbacks are not installed on internal coproc or management
 * interfaces.  Attaching a non-TSO filter bumps the filter count and
 * route generation so TCP stops offloading segmentation on affected
 * connections.  Returns 0 on success, ENXIO if the interface is gone.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	/* enter the filter monitor before mutating the filter list */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK
	if (kernel_is_macos_or_server()) {
		net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
		    net_check_compatible_if_filter(NULL));
	}
#endif /* SKYWALK */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the io refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2620 
/*
 * Detach an interface filter.
 *
 * detached == 0 (normal path): walk the ifnet list to locate the
 * interface the filter is attached to, unlink it under the filter
 * monitor, and adjust the per-ifp and global counters.
 *
 * detached != 0: called from ifnet_detach_final(); the caller has
 * already emptied if_flt_head, so only the counters are adjusted
 * here before the filter is destroyed.
 *
 * Returns 0 on success, or EINVAL if the filter reference could not
 * be found on any attached interface (normal path only).
 */
static int
dlil_detach_filter_internal(interface_filter_t  filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				/* skip entries already being detached */
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevaluate doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				/*
				 * When we remove the bridge's interface filter,
				 * clear the field in the ifnet.
				 */
				if ((filter->filt_flags & DLIL_IFF_BRIDGE)
				    != 0) {
					ifp->if_bridge = NULL;
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevaluate doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	/* Global attach counters must never underflow */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK
	/* Re-evaluate interface-filter compatibility for Skywalk */
	if (kernel_is_macos_or_server()) {
		net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
		    net_check_compatible_if_filter(NULL));
	}
#endif /* SKYWALK */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* filter is NULL here on the destroy path; only EINVAL reaches this */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2751 
2752 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2753 dlil_detach_filter(interface_filter_t filter)
2754 {
2755 	if (filter == NULL) {
2756 		return;
2757 	}
2758 	dlil_detach_filter_internal(filter, 0);
2759 }
2760 
2761 __private_extern__ boolean_t
dlil_has_ip_filter(void)2762 dlil_has_ip_filter(void)
2763 {
2764 	boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
2765 
2766 	VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
2767 
2768 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2769 	return has_filter;
2770 }
2771 
2772 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2773 dlil_has_if_filter(struct ifnet *ifp)
2774 {
2775 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2776 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2777 	return has_filter;
2778 }
2779 
/*
 * Mark work as pending for an input thread and wake it if needed.
 *
 * DLIL_INPUT_WAITING is set unconditionally; an actual wakeup (and a
 * bump of dlth_wtot, which feeds the rxpoll wakeup-rate EWMA) is only
 * issued when the thread is not currently running, since a running
 * thread will observe the WAITING flag before blocking again.
 *
 * Caller must hold dlth_lock.
 */
static inline void
dlil_input_wakeup(struct dlil_threading_info *inp)
{
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);

	inp->dlth_flags |= DLIL_INPUT_WAITING;
	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
		inp->dlth_wtot++;
		wakeup_one((caddr_t)&inp->dlth_flags);
	}
}
2791 
/*
 * Start routine for the main DLIL input thread.  Performs the
 * one-time embryonic handshake and then blocks with
 * dlil_main_input_thread_cont as the continuation; control never
 * returns here.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before dropping the lock to avoid a lost wakeup */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
2814 
2815 /*
2816  * Main input thread:
2817  *
2818  *   a) handles all inbound packets for lo0
2819  *   b) handles all inbound packets for interfaces with no dedicated
2820  *	input thread (e.g. anything but Ethernet/PDP or those that support
2821  *	opportunistic polling.)
2822  *   c) protocol registrations
2823  *   d) packet injections
2824  */
2825 __attribute__((noreturn))
2826 static void
dlil_main_input_thread_cont(void * v,wait_result_t wres)2827 dlil_main_input_thread_cont(void *v, wait_result_t wres)
2828 {
2829 	struct dlil_main_threading_info *inpm = v;
2830 	struct dlil_threading_info *inp = v;
2831 
2832 	/* main input thread is uninterruptible */
2833 	VERIFY(wres != THREAD_INTERRUPTED);
2834 	lck_mtx_lock_spin(&inp->dlth_lock);
2835 	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
2836 	    DLIL_INPUT_RUNNING)));
2837 	inp->dlth_flags |= DLIL_INPUT_RUNNING;
2838 
2839 	while (1) {
2840 		struct mbuf *m = NULL, *m_loop = NULL;
2841 		u_int32_t m_cnt, m_cnt_loop;
2842 		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
2843 		boolean_t proto_req;
2844 		boolean_t embryonic;
2845 
2846 		inp->dlth_flags &= ~DLIL_INPUT_WAITING;
2847 
2848 		if (__improbable(embryonic =
2849 		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
2850 			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
2851 		}
2852 
2853 		proto_req = (inp->dlth_flags &
2854 		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
2855 
2856 		/* Packets for non-dedicated interfaces other than lo0 */
2857 		m_cnt = qlen(&inp->dlth_pkts);
2858 		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
2859 		m = pkt.cp_mbuf;
2860 
2861 		/* Packets exclusive to lo0 */
2862 		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
2863 		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
2864 		m_loop = pkt.cp_mbuf;
2865 
2866 		inp->dlth_wtot = 0;
2867 
2868 		lck_mtx_unlock(&inp->dlth_lock);
2869 
2870 		if (__improbable(embryonic)) {
2871 			dlil_decr_pending_thread_count();
2872 		}
2873 
2874 		/*
2875 		 * NOTE warning %%% attention !!!!
2876 		 * We should think about putting some thread starvation
2877 		 * safeguards if we deal with long chains of packets.
2878 		 */
2879 		if (__probable(m_loop != NULL)) {
2880 			dlil_input_packet_list_extended(lo_ifp, m_loop,
2881 			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
2882 		}
2883 
2884 		if (__probable(m != NULL)) {
2885 			dlil_input_packet_list_extended(NULL, m,
2886 			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
2887 		}
2888 
2889 		if (__improbable(proto_req)) {
2890 			proto_input_run();
2891 		}
2892 
2893 		lck_mtx_lock_spin(&inp->dlth_lock);
2894 		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
2895 		/* main input thread cannot be terminated */
2896 		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
2897 		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
2898 			break;
2899 		}
2900 	}
2901 
2902 	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
2903 	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
2904 	lck_mtx_unlock(&inp->dlth_lock);
2905 	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
2906 
2907 	VERIFY(0);      /* we should never get here */
2908 	/* NOTREACHED */
2909 	__builtin_unreachable();
2910 }
2911 
2912 /*
2913  * Input thread for interfaces with legacy input model.
2914  */
2915 __attribute__((noreturn))
2916 static void
dlil_input_thread_func(void * v,wait_result_t w)2917 dlil_input_thread_func(void *v, wait_result_t w)
2918 {
2919 #pragma unused(w)
2920 	char thread_name[MAXTHREADNAMESIZE];
2921 	struct dlil_threading_info *inp = v;
2922 	struct ifnet *ifp = inp->dlth_ifp;
2923 
2924 	VERIFY(inp != dlil_main_input_thread);
2925 	VERIFY(ifp != NULL);
2926 	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
2927 	    !(ifp->if_xflags & IFXF_LEGACY));
2928 	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
2929 	    !(ifp->if_xflags & IFXF_LEGACY));
2930 	VERIFY(current_thread() == inp->dlth_thread);
2931 
2932 	/* construct the name for this thread, and then apply it */
2933 	bzero(thread_name, sizeof(thread_name));
2934 	(void) snprintf(thread_name, sizeof(thread_name),
2935 	    "dlil_input_%s", ifp->if_xname);
2936 	thread_set_thread_name(inp->dlth_thread, thread_name);
2937 
2938 	lck_mtx_lock(&inp->dlth_lock);
2939 	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
2940 	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
2941 	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
2942 	/* wake up once to get out of embryonic state */
2943 	dlil_input_wakeup(inp);
2944 	lck_mtx_unlock(&inp->dlth_lock);
2945 	(void) thread_block_parameter(dlil_input_thread_cont, inp);
2946 	/* NOTREACHED */
2947 	__builtin_unreachable();
2948 }
2949 
/*
 * Continuation body for a per-interface (legacy model) input thread.
 * Each pass drains the interface's pending packet queue, syncs input
 * statistics, and re-blocks; honors DLIL_INPUT_TERMINATE by handing
 * itself to dlil_terminate_input_thread().
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* bail out now if we were interrupted or asked to terminate */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation: leave embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again if more work arrived while we were busy */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait and block on the continuation */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3053 
3054 /*
3055  * Input thread for interfaces with opportunistic polling input model.
3056  */
3057 __attribute__((noreturn))
3058 static void
dlil_rxpoll_input_thread_func(void * v,wait_result_t w)3059 dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
3060 {
3061 #pragma unused(w)
3062 	char thread_name[MAXTHREADNAMESIZE];
3063 	struct dlil_threading_info *inp = v;
3064 	struct ifnet *ifp = inp->dlth_ifp;
3065 
3066 	VERIFY(inp != dlil_main_input_thread);
3067 	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
3068 	    (ifp->if_xflags & IFXF_LEGACY));
3069 	VERIFY(current_thread() == inp->dlth_thread);
3070 
3071 	/* construct the name for this thread, and then apply it */
3072 	bzero(thread_name, sizeof(thread_name));
3073 	(void) snprintf(thread_name, sizeof(thread_name),
3074 	    "dlil_input_poll_%s", ifp->if_xname);
3075 	thread_set_thread_name(inp->dlth_thread, thread_name);
3076 
3077 	lck_mtx_lock(&inp->dlth_lock);
3078 	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3079 	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3080 	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3081 	/* wake up once to get out of embryonic state */
3082 	dlil_input_wakeup(inp);
3083 	lck_mtx_unlock(&inp->dlth_lock);
3084 	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
3085 	/* NOTREACHED */
3086 	__builtin_unreachable();
3087 }
3088 
/*
 * Continuation body for an opportunistic-polling input thread.
 *
 * Each pass drains the pending packet queue and, once per sampling
 * hold time, updates EWMA statistics of inbound packets/bytes and
 * wakeup requests.  Based on those averages versus the per-ifp
 * low/high watermarks it may switch the interface between
 * IFNET_MODEL_INPUT_POLL_OFF and IFNET_MODEL_INPUT_POLL_ON, issuing
 * a driver downcall (if_input_ctl) when the mode changes.  Honors
 * DLIL_INPUT_TERMINATE via dlil_terminate_input_thread().
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* bail out now if we were interrupted or asked to terminate */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: skip sampling entirely */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the poll interval to the supported minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* not yet time to recompute averages */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				/* rate-limited debug dump of current stats */
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* enforce hysteresis: hold current mode for a while */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/* below both low watermarks: turn polling off */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				/* above high watermarks: turn polling on */
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* tell the driver to switch its input model */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again if more work arrived while we were busy */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait and block on the continuation */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3374 
3375 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3376 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3377 {
3378 	if (p != NULL) {
3379 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3380 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3381 			return EINVAL;
3382 		}
3383 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3384 		    p->packets_lowat >= p->packets_hiwat) {
3385 			return EINVAL;
3386 		}
3387 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3388 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3389 			return EINVAL;
3390 		}
3391 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3392 		    p->bytes_lowat >= p->bytes_hiwat) {
3393 			return EINVAL;
3394 		}
3395 		if (p->interval_time != 0 &&
3396 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3397 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3398 		}
3399 	}
3400 	return 0;
3401 }
3402 
/*
 * Recompute an interface's rx-poll parameters.
 *
 * If the input link rate is unknown (0) and no explicit parameters
 * were supplied, polling is disabled: watermarks are set so the
 * low thresholds are always met and the high ones never are.
 * Otherwise, values are looked up from rxpoll_tbl[] based on the
 * link rate, with any non-zero caller-supplied field taking
 * precedence over the table entry.
 *
 * Caller is expected to hold the input thread's dlth_lock (see
 * dlil_rxpoll_set_params).
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* find the last table row whose speed is <= inbw */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/*
		 * NOTE(review): unlike the other fields, a non-zero
		 * if_rxpoll_max sysctl takes precedence over the
		 * caller's packets_limit here — presumably an
		 * intentional global override; confirm before changing.
		 */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert nanosecond hold times into timespec form */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3472 
3473 /*
3474  * Must be called on an attached ifnet (caller is expected to check.)
3475  * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3476  */
3477 errno_t
dlil_rxpoll_set_params(struct ifnet * ifp,struct ifnet_poll_params * p,boolean_t locked)3478 dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
3479     boolean_t locked)
3480 {
3481 	errno_t err;
3482 	struct dlil_threading_info *inp;
3483 
3484 	VERIFY(ifp != NULL);
3485 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3486 		return ENXIO;
3487 	}
3488 	err = dlil_rxpoll_validate_params(p);
3489 	if (err != 0) {
3490 		return err;
3491 	}
3492 
3493 	if (!locked) {
3494 		lck_mtx_lock(&inp->dlth_lock);
3495 	}
3496 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3497 	/*
3498 	 * Normally, we'd reset the parameters to the auto-tuned values
3499 	 * if the the input thread detects a change in link rate.  If the
3500 	 * driver provides its own parameters right after a link rate
3501 	 * changes, but before the input thread gets to run, we want to
3502 	 * make sure to keep the driver's values.  Clearing if_poll_update
3503 	 * will achieve that.
3504 	 */
3505 	if (p != NULL && !locked && ifp->if_poll_update != 0) {
3506 		ifp->if_poll_update = 0;
3507 	}
3508 	dlil_rxpoll_update_params(ifp, p);
3509 	if (!locked) {
3510 		lck_mtx_unlock(&inp->dlth_lock);
3511 	}
3512 	return 0;
3513 }
3514 
3515 /*
3516  * Must be called on an attached ifnet (caller is expected to check.)
3517  */
3518 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3519 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3520 {
3521 	struct dlil_threading_info *inp;
3522 
3523 	VERIFY(ifp != NULL && p != NULL);
3524 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3525 		return ENXIO;
3526 	}
3527 
3528 	bzero(p, sizeof(*p));
3529 
3530 	lck_mtx_lock(&inp->dlth_lock);
3531 	p->packets_limit = ifp->if_rxpoll_plim;
3532 	p->packets_lowat = ifp->if_rxpoll_plowat;
3533 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3534 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3535 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3536 	p->interval_time = ifp->if_rxpoll_ival;
3537 	lck_mtx_unlock(&inp->dlth_lock);
3538 
3539 	return 0;
3540 }
3541 
3542 errno_t
ifnet_input(struct ifnet * ifp,struct mbuf * m_head,const struct ifnet_stat_increment_param * s)3543 ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
3544     const struct ifnet_stat_increment_param *s)
3545 {
3546 	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
3547 }
3548 
3549 errno_t
ifnet_input_extended(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3550 ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
3551     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3552 {
3553 	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
3554 }
3555 
/*
 * Poll variant used by the RX poller: an empty chain (m_head == NULL)
 * is permitted, and the call is treated as extended only when packets
 * are actually present.
 */
errno_t
ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s,
	           (m_head != NULL), TRUE);
}
3563 
/*
 * Common entry point for all ifnet_input*() variants.
 *
 * Validates the arguments, computes (or verifies) the packet/byte
 * counts for the chain, takes an IO refcnt on the interface, and hands
 * the chain to the interface's installed DLIL input function.  The
 * chain is freed here on any error and when input has been disabled
 * via IFXF_DISABLE_INPUT.
 *
 * `ext'  - stats increment `s' came from the driver and is mandatory.
 * `poll' - invoked from the RX poller; an empty chain is allowed.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* a NULL chain is only valid for poll mode; ext requires stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail supplied: walk the chain to find it and count as we go */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* recount the whole chain to cross-check the driver's stats */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	/*
	 * _s always carries the locally computed counts (used by the
	 * disabled-input path below); when the caller supplied stats,
	 * the original `s' is what gets passed to the input function.
	 */
	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	if (ifp->if_xflags & IFXF_DISABLE_INPUT) {
		/* input disabled: count the packets as received, then drop them */
		m_freem_list(m_head);

		os_atomic_add(&ifp->if_data.ifi_ipackets, _s.packets_in, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes, _s.bytes_in, relaxed);

		goto done;
	}

	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

done:
	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3688 
3689 #if SKYWALK
/*
 * Atomically install `fn' as the interface's DLIL input function, but
 * only if the current function is still the default dlil_input_handler.
 * Returns EBUSY if another handler has already been installed.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
3697 
/*
 * Restore the default dlil_input_handler, retrying until the
 * compare-and-swap succeeds (the expected value is re-read from
 * if_input_dlil on each iteration, so a concurrent change only
 * causes another pass around the loop).
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
		;
	}
}
3707 
/*
 * Atomically install `fn' as the interface's DLIL output function, but
 * only if the current function is still the default dlil_output_handler.
 * Returns EBUSY if another handler has already been installed.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
3715 
/*
 * Restore the default dlil_output_handler, retrying until the
 * compare-and-swap succeeds (expected value re-read each iteration).
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
		;
	}
}
3725 #endif /* SKYWALK */
3726 
/*
 * Default DLIL output function: hand the packet straight to the
 * driver's if_output routine.
 */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
3732 
/*
 * Default DLIL input function: dispatch the chain to the input
 * thread's strategy routine, falling back to the main input thread
 * when the interface has no dedicated one.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	/* development aid: force synchronous input for marked threads */
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
3753 
/*
 * Detect whether a queue contains a burst that needs to be trimmed.
 * True when the queue holds mbufs (QP_MBUF) and its length exceeds
 * both its own limit and the if_rcvq_burst_limit sysctl.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q)                                                                  \
	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) &&           \
	                        qtype(q) == QP_MBUF)

/*
 * NOTE(review): not referenced in this section of the file --
 * presumably the number of known mbuf classes used elsewhere; confirm
 * against the rest of dlil.c before relying on this description.
 */
#define MAX_KNOWN_MBUF_CLASS 8
3762 
/*
 * Trim an overcommitted input queue down to a target length
 * (if_rcvq_trim_pct percent of the queue limit), removing packets
 * from the head of the queue (oldest first).  The removed mbufs are
 * moved onto `freeq' so the caller can free them after dropping the
 * thread lock, and `stat_delta' is adjusted to account for the drops.
 *
 * Returns the number of packets dropped (0 if the queue was already
 * within limits).  Caller must hold the owning thread's dlth_lock.
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;    /* Length in packets. */
	uint64_t overcommitted_qsize;   /* Size in bytes. */
	uint32_t target_qlen;           /* The desired queue length after trimming. */
	uint32_t pkts_to_drop = 0;      /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;      /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 * The subtraction saturates at zero because the increments may
	 * cover only part of what is currently queued.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
3859 
/*
 * Asynchronous input strategy: stage the packet chain on the input
 * thread's receive queue (or the dedicated loopback queue when lo0
 * feeds the main input thread), trim the queue if it has grown past
 * its burst limit, account the (possibly trimmed) stats, and wake the
 * input thread, which performs the actual protocol processing later.
 * Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	/* starts as a copy of *s; reduced below if packets get trimmed */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			/* trim to target; excess goes onto freeq for later freeing */
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			/*
			 * NOTE(review): format string ends with a trailing
			 * space where dlil_input_sync() has " \n" -- cosmetic
			 * inconsistency only (os_log does not need a newline).
			 */
			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4003 
/*
 * Synchronous input strategy: enqueue the chain on the input thread's
 * receive queue (and trim it if overcommitted), then immediately drain
 * the queue and process the packets on the calling thread instead of
 * deferring to the input thread.  Never used for the main input
 * thread.  Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	/* starts as a copy of *s; reduced below if packets get trimmed */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		/* trim to target; excess goes onto freeq for later freeing */
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/* cross-check the caller-supplied counts against the actual chain */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* drain everything queued (including what we just added) */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(ifp, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4113 
4114 #if SKYWALK
/*
 * Atomically replace the driver's if_output with `fn', but only if
 * if_output still holds the saved original (if_save_output), i.e. no
 * other handler has been interposed.  Returns EBUSY otherwise.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	           ptrauth_nop_cast(void *, ifp->if_save_output),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4122 
/*
 * Restore the driver's saved if_output routine, retrying until the
 * compare-and-swap succeeds (expected value re-read each iteration).
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
		;
	}
}
4132 
/*
 * Atomically replace the driver's if_start with `fn', but only if
 * if_start still holds the saved original (if_save_start).  Returns
 * EBUSY if another handler has already been interposed.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	           ptrauth_nop_cast(void *, ifp->if_save_start),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4140 
/*
 * Restore the driver's saved if_start routine, retrying until the
 * compare-and-swap succeeds (expected value re-read each iteration).
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
		;
	}
}
4150 #endif /* SKYWALK */
4151 
/*
 * Common starter-thread kick routine.  `resetfc' clears the
 * flow-controlled state (used when output is being resumed);
 * `ignore_delay' exempts this request from the delayed-start
 * heuristic.  No-op for interfaces without a starter thread
 * (IFEF_TXSTART clear).
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (ignore_delay) {
		ifp->if_start_flags |= IFSF_NO_DELAY;
	}
	if (resetfc) {
		/* resume: clear flow control so the wakeup below can happen */
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/* wake the starter unless the delayed-start heuristic applies */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4184 
/*
 * Kick the interface's starter thread (honoring flow control and the
 * delayed-start heuristic).
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, FALSE);
}
4190 
/*
 * Kick the starter thread, bypassing the delayed-start heuristic
 * (sets IFSF_NO_DELAY for this request).
 */
void
ifnet_start_ignore_delay(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, TRUE);
}
4196 
/*
 * Entry point of the per-interface starter thread.  Names the thread,
 * optionally binds the lo0 starter to the main input thread's affinity
 * set, then transitions into the embryonic state and blocks into
 * ifnet_start_thread_cont(), which does all subsequent work.  Never
 * returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4262 
/*
 * Continuation body of the starter thread.  On each wakeup it either:
 * leaves the embryonic state, services start requests by repeatedly
 * invoking the driver's if_start under an IO refcnt, arms a restart
 * timer (token-bucket cycle or delayed-start timeout) and blocks back
 * into itself, or terminates when the interface is going away.  Never
 * returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	if (__improbable(ifp->if_start_embryonic)) {
		/* first wakeup: leave embryonic state, skip servicing */
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/* delayed-start heuristic: batch small queues before starting */
		if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
		    !IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_flags &= ~IFSF_NO_DELAY;
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts = NULL;

		/*
		 * NOTE(review): `ts' was just initialized to NULL, so this
		 * condition is always true -- redundant but harmless.
		 */
		if (ts == NULL) {
			ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
			    &ifp->if_start_cycle : NULL);
		}

		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		/* a zero interval means "no timed restart" */
		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4408 
4409 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4410 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4411 {
4412 	if (ts == NULL) {
4413 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4414 	} else {
4415 		*(&ifp->if_start_cycle) = *ts;
4416 	}
4417 
4418 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4419 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4420 		    if_name(ifp), ts->tv_nsec);
4421 	}
4422 }
4423 
4424 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4425 ifnet_poll_wakeup(struct ifnet *ifp)
4426 {
4427 	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4428 
4429 	ifp->if_poll_req++;
4430 	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4431 	    ifp->if_poll_thread != THREAD_NULL) {
4432 		wakeup_one((caddr_t)&ifp->if_poll_thread);
4433 	}
4434 }
4435 
/*
 * Request a poll pass from the interface's poller thread.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4446 
/*
 * Entry point of the per-interface RX poller thread.  Names the
 * thread, transitions into the embryonic state, and blocks into
 * ifnet_poll_thread_cont(), which does all subsequent polling work.
 * Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4475 
/*
 * Continuation for the RX poller thread: each wakeup runs the poll loop
 * (driver if_input_poll -> ifnet_input_common), then blocks again with
 * itself as the continuation, or terminates when the interface detaches.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	/* interrupted wait or pending detach: go straight to termination */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/*
	 * First wakeup after creation: leave the embryonic state and release
	 * the creator's pending-thread count, but skip the poll loop.
	 */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		/* snapshot the request counter to detect new requests below */
		u_int16_t req = ifp->if_poll_req;

		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll: still notify the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		/* re-enter this continuation on the next wakeup */
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4642 
4643 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4644 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4645 {
4646 	if (ts == NULL) {
4647 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4648 	} else {
4649 		*(&ifp->if_poll_cycle) = *ts;
4650 	}
4651 
4652 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4653 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4654 		    if_name(ifp), ts->tv_nsec);
4655 	}
4656 }
4657 
4658 void
ifnet_purge(struct ifnet * ifp)4659 ifnet_purge(struct ifnet *ifp)
4660 {
4661 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4662 		if_qflush_snd(ifp, false);
4663 	}
4664 }
4665 
4666 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)4667 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
4668 {
4669 	IFCQ_LOCK_ASSERT_HELD(ifq);
4670 
4671 	if (!(IFCQ_IS_READY(ifq))) {
4672 		return;
4673 	}
4674 
4675 	if (IFCQ_TBR_IS_ENABLED(ifq)) {
4676 		struct tb_profile tb = {
4677 			.rate = ifq->ifcq_tbr.tbr_rate_raw,
4678 			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
4679 		};
4680 		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
4681 	}
4682 
4683 	ifclassq_update(ifq, ev);
4684 }
4685 
4686 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4687 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4688 {
4689 	switch (ev) {
4690 	case CLASSQ_EV_LINK_BANDWIDTH:
4691 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4692 			ifp->if_poll_update++;
4693 		}
4694 		break;
4695 
4696 	default:
4697 		break;
4698 	}
4699 }
4700 
4701 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4702 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4703 {
4704 	struct ifclassq *ifq;
4705 	u_int32_t omodel;
4706 	errno_t err;
4707 
4708 	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4709 		return EINVAL;
4710 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4711 		return ENXIO;
4712 	}
4713 
4714 	ifq = ifp->if_snd;
4715 	IFCQ_LOCK(ifq);
4716 	omodel = ifp->if_output_sched_model;
4717 	ifp->if_output_sched_model = model;
4718 	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4719 		ifp->if_output_sched_model = omodel;
4720 	}
4721 	IFCQ_UNLOCK(ifq);
4722 
4723 	return err;
4724 }
4725 
4726 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4727 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4728 {
4729 	if (ifp == NULL) {
4730 		return EINVAL;
4731 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4732 		return ENXIO;
4733 	}
4734 
4735 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4736 
4737 	return 0;
4738 }
4739 
4740 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4741 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4742 {
4743 	if (ifp == NULL || maxqlen == NULL) {
4744 		return EINVAL;
4745 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4746 		return ENXIO;
4747 	}
4748 
4749 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4750 
4751 	return 0;
4752 }
4753 
4754 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4755 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4756 {
4757 	errno_t err;
4758 
4759 	if (ifp == NULL || pkts == NULL) {
4760 		err = EINVAL;
4761 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4762 		err = ENXIO;
4763 	} else {
4764 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4765 		    IF_CLASSQ_ALL_GRPS, pkts, NULL);
4766 	}
4767 
4768 	return err;
4769 }
4770 
4771 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4772 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4773     u_int32_t *pkts, u_int32_t *bytes)
4774 {
4775 	errno_t err;
4776 
4777 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4778 	    (pkts == NULL && bytes == NULL)) {
4779 		err = EINVAL;
4780 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4781 		err = ENXIO;
4782 	} else {
4783 		err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4784 		    pkts, bytes);
4785 	}
4786 
4787 	return err;
4788 }
4789 
4790 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4791 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4792 {
4793 	struct dlil_threading_info *inp;
4794 
4795 	if (ifp == NULL) {
4796 		return EINVAL;
4797 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4798 		return ENXIO;
4799 	}
4800 
4801 	if (maxqlen == 0) {
4802 		maxqlen = if_rcvq_maxlen;
4803 	} else if (maxqlen < IF_RCVQ_MINLEN) {
4804 		maxqlen = IF_RCVQ_MINLEN;
4805 	}
4806 
4807 	inp = ifp->if_inp;
4808 	lck_mtx_lock(&inp->dlth_lock);
4809 	qlimit(&inp->dlth_pkts) = maxqlen;
4810 	lck_mtx_unlock(&inp->dlth_lock);
4811 
4812 	return 0;
4813 }
4814 
4815 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4816 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4817 {
4818 	struct dlil_threading_info *inp;
4819 
4820 	if (ifp == NULL || maxqlen == NULL) {
4821 		return EINVAL;
4822 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4823 		return ENXIO;
4824 	}
4825 
4826 	inp = ifp->if_inp;
4827 	lck_mtx_lock(&inp->dlth_lock);
4828 	*maxqlen = qlimit(&inp->dlth_pkts);
4829 	lck_mtx_unlock(&inp->dlth_lock);
4830 	return 0;
4831 }
4832 
4833 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4834 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4835     uint16_t delay_timeout)
4836 {
4837 	if (delay_qlen > 0 && delay_timeout > 0) {
4838 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4839 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4840 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
4841 		/* convert timeout to nanoseconds */
4842 		ifp->if_start_delay_timeout *= 1000;
4843 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4844 		    ifp->if_xname, (uint32_t)delay_qlen,
4845 		    (uint32_t)delay_timeout);
4846 	} else {
4847 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4848 	}
4849 }
4850 
4851 /*
4852  * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
4853  * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
4854  * buf holds the full header.
4855  */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned scratch copy, used when buf is not suitably aligned */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* fast path: no DSCP bits set, nothing to clear */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * Incrementally patch the header checksum for the TOS
		 * change (RFC 1624 style), folding the carry back in.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		/* write the modified header back if we used the scratch */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* fast path: no DSCP bits set, nothing to clear */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* no checksum fixup needed: IPv6 has no header checksum */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		/* write the modified header back if we used the scratch */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
4911 
/*
 * Core single-packet enqueue into an interface's classq (or the supplied
 * ifcq override).  Handles timestamping, foreground/realtime activity
 * tracking, the Wi-Fi multicast DSCP workaround, and the delayed-start
 * heuristics, then hands the packet to ifclassq_enqueue() and kicks the
 * driver's start routine as needed.  Caller consumes the packet object.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		/* stamp the packet if it doesn't already carry a timestamp */
		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* ensure the Ethernet header is contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = mtod(p->cp_mbuf, struct ether_header *);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP multicast: skip the DSCP workaround */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				eh = mtod(p->cp_mbuf, struct ether_header *);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		/* stamp the packet if it doesn't already carry a timestamp */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP multicast: skip the DSCP workaround */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the sampling window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle too long: reset and disable delay */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: evaluate the heuristics */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open a new sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5221 
5222 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5223 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5224     classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5225     boolean_t flush, boolean_t *pdrop)
5226 {
5227 	int error;
5228 
5229 	/* enqueue the packet (caller consumes object) */
5230 	error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5231 	    cnt, bytes, pdrop);
5232 
5233 	/*
5234 	 * Tell the driver to start dequeueing; do this even when the queue
5235 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5236 	 * be dequeueing from other unsuspended queues.
5237 	 */
5238 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5239 		ifnet_start(ifp);
5240 	}
5241 	return error;
5242 }
5243 
5244 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5245 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5246 {
5247 	struct ifnet *ifp = handle;
5248 	boolean_t pdrop;        /* dummy */
5249 	uint32_t i;
5250 
5251 	ASSERT(n_pkts >= 1);
5252 	for (i = 0; i < n_pkts - 1; i++) {
5253 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5254 		    FALSE, &pdrop);
5255 	}
5256 	/* flush with the last packet */
5257 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5258 	    TRUE, &pdrop);
5259 
5260 	return 0;
5261 }
5262 
5263 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5264 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5265     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5266 {
5267 	if (ifp->if_output_netem != NULL) {
5268 		bool drop;
5269 		errno_t error;
5270 		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5271 		*pdrop = drop ? TRUE : FALSE;
5272 		return error;
5273 	} else {
5274 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5275 	}
5276 }
5277 
5278 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5279 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5280 {
5281 	uint32_t bytes = m_pktlen(m);
5282 	struct mbuf *tail = m;
5283 	uint32_t cnt = 1;
5284 	boolean_t pdrop;
5285 
5286 	while (tail->m_nextpkt) {
5287 		VERIFY(tail->m_flags & M_PKTHDR);
5288 		tail = tail->m_nextpkt;
5289 		cnt++;
5290 		bytes += m_pktlen(tail);
5291 	}
5292 
5293 	return ifnet_enqueue_mbuf_chain(ifp, m, tail, cnt, bytes, TRUE, &pdrop);
5294 }
5295 
5296 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5297 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5298     boolean_t *pdrop)
5299 {
5300 	classq_pkt_t pkt;
5301 
5302 	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5303 	    m->m_nextpkt != NULL) {
5304 		if (m != NULL) {
5305 			m_freem_list(m);
5306 			*pdrop = TRUE;
5307 		}
5308 		return EINVAL;
5309 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5310 	    !IF_FULLY_ATTACHED(ifp)) {
5311 		/* flag tested without lock for performance */
5312 		m_freem(m);
5313 		*pdrop = TRUE;
5314 		return ENXIO;
5315 	} else if (!(ifp->if_flags & IFF_UP)) {
5316 		m_freem(m);
5317 		*pdrop = TRUE;
5318 		return ENETDOWN;
5319 	}
5320 
5321 	CLASSQ_PKT_INIT_MBUF(&pkt, m);
5322 	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5323 }
5324 
5325 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5326 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5327     struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5328     boolean_t *pdrop)
5329 {
5330 	classq_pkt_t head, tail;
5331 
5332 	ASSERT(m_head != NULL);
5333 	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5334 	ASSERT(m_tail != NULL);
5335 	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5336 	ASSERT(ifp != NULL);
5337 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5338 
5339 	if (!IF_FULLY_ATTACHED(ifp)) {
5340 		/* flag tested without lock for performance */
5341 		m_freem_list(m_head);
5342 		*pdrop = TRUE;
5343 		return ENXIO;
5344 	} else if (!(ifp->if_flags & IFF_UP)) {
5345 		m_freem_list(m_head);
5346 		*pdrop = TRUE;
5347 		return ENETDOWN;
5348 	}
5349 
5350 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
5351 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5352 	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5353 	           flush, pdrop);
5354 }
5355 
5356 #if SKYWALK
5357 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5358 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5359     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5360 {
5361 	classq_pkt_t pkt;
5362 
5363 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5364 
5365 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5366 		if (kpkt != NULL) {
5367 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5368 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5369 			*pdrop = TRUE;
5370 		}
5371 		return EINVAL;
5372 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5373 	    !IF_FULLY_ATTACHED(ifp))) {
5374 		/* flag tested without lock for performance */
5375 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5376 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5377 		*pdrop = TRUE;
5378 		return ENXIO;
5379 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5380 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5381 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5382 		*pdrop = TRUE;
5383 		return ENETDOWN;
5384 	}
5385 
5386 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5387 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5388 }
5389 
/*
 * Enqueue a Skywalk packet onto the interface's default send queue.
 * Thin wrapper over ifnet_enqueue_pkt_common() with no ifcq override.
 */
errno_t
ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
    boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
}
5396 
/*
 * Enqueue a Skywalk packet onto an explicitly supplied classq.
 * Thin wrapper over ifnet_enqueue_pkt_common().
 */
errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
}
5403 
5404 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5405 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5406     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5407     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5408 {
5409 	classq_pkt_t head, tail;
5410 
5411 	ASSERT(k_head != NULL);
5412 	ASSERT(k_tail != NULL);
5413 	ASSERT(ifp != NULL);
5414 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5415 
5416 	if (!IF_FULLY_ATTACHED(ifp)) {
5417 		/* flag tested without lock for performance */
5418 		pp_free_packet_chain(k_head, NULL);
5419 		*pdrop = TRUE;
5420 		return ENXIO;
5421 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5422 		pp_free_packet_chain(k_head, NULL);
5423 		*pdrop = TRUE;
5424 		return ENETDOWN;
5425 	}
5426 
5427 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5428 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5429 	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5430 	           flush, pdrop);
5431 }
5432 
/*
 * Enqueue a Skywalk packet chain onto the interface's default send queue.
 * Thin wrapper over ifnet_enqueue_pkt_chain_common() with no ifcq override.
 */
errno_t
ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
    struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
    boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5441 
/*
 * Enqueue a Skywalk packet chain onto an explicitly supplied classq.
 * Thin wrapper over ifnet_enqueue_pkt_chain_common().
 */
errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
    uint32_t bytes, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5450 #endif /* SKYWALK */
5451 
5452 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5453 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5454 {
5455 	errno_t rc;
5456 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5457 
5458 	if (ifp == NULL || mp == NULL) {
5459 		return EINVAL;
5460 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5461 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5462 		return ENXIO;
5463 	}
5464 	if (!ifnet_is_attached(ifp, 1)) {
5465 		return ENXIO;
5466 	}
5467 
5468 #if SKYWALK
5469 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5470 #endif /* SKYWALK */
5471 	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5472 	    &pkt, NULL, NULL, NULL, 0);
5473 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5474 	ifnet_decr_iorefcnt(ifp);
5475 	*mp = pkt.cp_mbuf;
5476 	return rc;
5477 }
5478 
5479 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5480 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5481     struct mbuf **mp)
5482 {
5483 	errno_t rc;
5484 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5485 
5486 	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5487 		return EINVAL;
5488 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5489 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5490 		return ENXIO;
5491 	}
5492 	if (!ifnet_is_attached(ifp, 1)) {
5493 		return ENXIO;
5494 	}
5495 
5496 #if SKYWALK
5497 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5498 #endif /* SKYWALK */
5499 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5500 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5501 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5502 	ifnet_decr_iorefcnt(ifp);
5503 	*mp = pkt.cp_mbuf;
5504 	return rc;
5505 }
5506 
5507 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5508 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5509     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5510 {
5511 	errno_t rc;
5512 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5513 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5514 
5515 	if (ifp == NULL || head == NULL || pkt_limit < 1) {
5516 		return EINVAL;
5517 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5518 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5519 		return ENXIO;
5520 	}
5521 	if (!ifnet_is_attached(ifp, 1)) {
5522 		return ENXIO;
5523 	}
5524 
5525 #if SKYWALK
5526 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5527 #endif /* SKYWALK */
5528 	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5529 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5530 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5531 	ifnet_decr_iorefcnt(ifp);
5532 	*head = pkt_head.cp_mbuf;
5533 	if (tail != NULL) {
5534 		*tail = pkt_tail.cp_mbuf;
5535 	}
5536 	return rc;
5537 }
5538 
5539 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5540 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5541     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5542 {
5543 	errno_t rc;
5544 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5545 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5546 
5547 	if (ifp == NULL || head == NULL || byte_limit < 1) {
5548 		return EINVAL;
5549 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5550 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5551 		return ENXIO;
5552 	}
5553 	if (!ifnet_is_attached(ifp, 1)) {
5554 		return ENXIO;
5555 	}
5556 
5557 #if SKYWALK
5558 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5559 #endif /* SKYWALK */
5560 	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5561 	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5562 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5563 	ifnet_decr_iorefcnt(ifp);
5564 	*head = pkt_head.cp_mbuf;
5565 	if (tail != NULL) {
5566 		*tail = pkt_tail.cp_mbuf;
5567 	}
5568 	return rc;
5569 }
5570 
5571 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5572 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5573     u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5574     u_int32_t *len)
5575 {
5576 	errno_t rc;
5577 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5578 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5579 
5580 	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5581 	    !MBUF_VALID_SC(sc)) {
5582 		return EINVAL;
5583 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5584 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5585 		return ENXIO;
5586 	}
5587 	if (!ifnet_is_attached(ifp, 1)) {
5588 		return ENXIO;
5589 	}
5590 
5591 #if SKYWALK
5592 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5593 #endif /* SKYWALK */
5594 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
5595 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
5596 	    cnt, len, 0);
5597 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5598 	ifnet_decr_iorefcnt(ifp);
5599 	*head = pkt_head.cp_mbuf;
5600 	if (tail != NULL) {
5601 		*tail = pkt_tail.cp_mbuf;
5602 	}
5603 	return rc;
5604 }
5605 
5606 #if XNU_TARGET_OS_OSX
5607 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)5608 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
5609     const struct sockaddr *dest, const char *dest_linkaddr,
5610     const char *frame_type, u_int32_t *pre, u_int32_t *post)
5611 {
5612 	if (pre != NULL) {
5613 		*pre = 0;
5614 	}
5615 	if (post != NULL) {
5616 		*post = 0;
5617 	}
5618 
5619 	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
5620 }
5621 #endif /* XNU_TARGET_OS_OSX */
5622 
5623 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5624 packet_has_vlan_tag(struct mbuf * m)
5625 {
5626 	u_int   tag = 0;
5627 
5628 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5629 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5630 		if (tag == 0) {
5631 			/* the packet is just priority-tagged, clear the bit */
5632 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5633 		}
5634 	}
5635 	return tag != 0;
5636 }
5637 
/*
 * dlil_interface_filters_input
 *
 * Run an inbound packet through every attached interface filter.
 *
 * On return 0, *m_p / *frame_header_p may have been replaced by a
 * filter.  A non-zero return means a filter consumed or rejected the
 * packet; the caller must not touch it further (EJUSTRETURN means the
 * filter took ownership).
 *
 * Locking: the filter list lock is dropped around each filt_input
 * callback; if_flt_monitor_busy() keeps the list from changing while
 * the lock is not held.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family,
    boolean_t skip_bridge)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	/*
	 * NOTE: evaluated even when the filter list is empty, because it
	 * also clears CSUM_VLAN_TAG_VALID on priority-tagged packets.
	 */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}
		/* the bridge has already seen the packet */
		if (skip_bridge &&
		    (filter->filt_flags & DLIL_IFF_BRIDGE) != 0) {
			continue;
		}
		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; busy flag holds the list */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5703 
/*
 * dlil_interface_filters_output
 *
 * Run an outbound packet through every attached interface filter.
 *
 * On return 0, *m_p may have been replaced by a filter.  A non-zero
 * return means a filter consumed or rejected the packet (EJUSTRETURN
 * means the filter took ownership).
 *
 * Locking: same scheme as the input path — the filter list lock is
 * dropped around each filt_output callback while the busy flag pins
 * the list.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}
	/* also clears CSUM_VLAN_TAG_VALID on priority-tagged packets */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; busy flag holds the list */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5756 
5757 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)5758 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
5759 {
5760 	int error;
5761 
5762 	if (ifproto->proto_kpi == kProtoKPI_v1) {
5763 		/* Version 1 protocols get one packet at a time */
5764 		while (m != NULL) {
5765 			char *  frame_header;
5766 			mbuf_t  next_packet;
5767 
5768 			next_packet = m->m_nextpkt;
5769 			m->m_nextpkt = NULL;
5770 			frame_header = m->m_pkthdr.pkt_hdr;
5771 			m->m_pkthdr.pkt_hdr = NULL;
5772 			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
5773 			    ifproto->protocol_family, m, frame_header);
5774 			if (error != 0 && error != EJUSTRETURN) {
5775 				m_freem(m);
5776 			}
5777 			m = next_packet;
5778 		}
5779 	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
5780 		/* Version 2 protocols support packet lists */
5781 		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
5782 		    ifproto->protocol_family, m);
5783 		if (error != 0 && error != EJUSTRETURN) {
5784 			m_freem_list(m);
5785 		}
5786 	}
5787 }
5788 
5789 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5790 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5791     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5792 {
5793 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5794 
5795 	if (s->packets_in != 0) {
5796 		d->packets_in += s->packets_in;
5797 	}
5798 	if (s->bytes_in != 0) {
5799 		d->bytes_in += s->bytes_in;
5800 	}
5801 	if (s->errors_in != 0) {
5802 		d->errors_in += s->errors_in;
5803 	}
5804 
5805 	if (s->packets_out != 0) {
5806 		d->packets_out += s->packets_out;
5807 	}
5808 	if (s->bytes_out != 0) {
5809 		d->bytes_out += s->bytes_out;
5810 	}
5811 	if (s->errors_out != 0) {
5812 		d->errors_out += s->errors_out;
5813 	}
5814 
5815 	if (s->collisions != 0) {
5816 		d->collisions += s->collisions;
5817 	}
5818 	if (s->dropped != 0) {
5819 		d->dropped += s->dropped;
5820 	}
5821 
5822 	if (poll) {
5823 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5824 	}
5825 }
5826 
/*
 * dlil_input_stats_sync
 *
 * Fold the input thread's locally accumulated statistics into the
 * ifnet's global counters, zeroing the local copies as they are
 * consumed.  Returns TRUE if the interface has a non-zero data
 * threshold configured (if_data_threshold), so the caller knows
 * whether threshold notifications may be needed.
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
5886 
5887 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)5888 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
5889 {
5890 	return dlil_input_packet_list_common(ifp, m, 0,
5891 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
5892 }
5893 
5894 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)5895 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
5896     u_int32_t cnt, ifnet_model_t mode)
5897 {
5898 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
5899 }
5900 
/*
 * handle_bridge_early_input
 *
 * Give the bridge a first look at an inbound packet chain before the
 * normal DLIL input path runs.  Returns the (possibly replaced or
 * shortened) chain from bridge_early_input().
 *
 * The filter monitor is marked busy for the duration so the filter
 * list cannot change while the bridge processes the chain; the filter
 * lock itself is not held across the bridge call.
 */
static inline mbuf_t
handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt)
{
	/* pin the filter list, then drop the lock for the callout */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	if_flt_monitor_busy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* re-check under busy: if_bridge may have been cleared meanwhile */
	if (ifp->if_bridge != NULL) {
		m = bridge_early_input(ifp, m, cnt);
	}
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);
	return m;
}
5916 
/*
 * dlil_input_packet_list_common
 *
 * Core inbound demux loop.  For each packet in the chain:
 *  - determine the receiving ifnet (per-packet when ifp_param is NULL),
 *  - take/maintain a data-movement reference on non-loopback ifnets,
 *  - demux to a protocol family, run CLAT46 translation when needed,
 *  - adjust partial-checksum offsets, run interface filters,
 *  - batch consecutive packets for the same protocol and hand each
 *    batch to dlil_ifproto_input().
 *
 * 'ext' together with mode/cnt enables opportunistic polling for
 * legacy RXPOLL interfaces.  Packets that fail any stage are freed
 * here ("goto next").
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;        /* head of current same-proto batch */
	mbuf_t *pkt_next = NULL;        /* tail link of current batch */
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;               /* 1 while holding a datamov ref on ifp */
	boolean_t skip_bridge_filter = FALSE;

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* arm opportunistic polling only for the extended, poll-on case */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}
	/* let the bridge pre-process the chain; its filter is then skipped */
	if (bridge_enable_early_input != 0 &&
	    ifp != NULL && ifp->if_bridge != NULL) {
		m = handle_bridge_early_input(ifp, m, cnt);
		skip_bridge_filter = TRUE;
	}
	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		/* no fixed interface: each packet names its own receiver */
		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		/* kick the poller every poll_ival packets on legacy RXPOLL */
		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach packet and its frame header from the chain */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				/* demux consumed the packet */
				goto next;
			}
			protocol_family = 0;
		}
		/* check for an updated frame header */
		if (m->m_pkthdr.pkt_hdr != NULL) {
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		/* log a hexdump of packets that woke the system, if requested */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if ((m->m_flags & M_PROMISC) == 0 &&
		    protocol_family == PF_INET6 &&
		    IS_INTF_CLAT46(ifp) &&
		    dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			/* may replace m and change protocol_family to PF_INET */
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = mtod(m, char*);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_freem(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceeding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has been
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		/* hardware checksum debugging, non-loopback traffic only */
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			/* invalidate the checksum if the header bounds are bogus */
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - (uintptr_t)frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			os_atomic_inc(&ifp->if_imcasts, relaxed);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family, skip_bridge_filter);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN and Bond interface receives packets by attaching
		 * a "protocol" to the underlying interface.
		 * A promiscuous packet needs to be delivered to the
		 * VLAN or Bond interface since:
		 * - Bond interface member may not support setting the
		 *   MAC address, so packets are inherently "promiscuous"
		 * - A VLAN or Bond interface could be members of a bridge,
		 *   where promiscuous packets correspond to other
		 *   devices that the bridge forwards packets to/from
		 */
		if ((m->m_flags & M_PROMISC) != 0) {
			switch (protocol_family) {
			case PF_VLAN:
			case PF_BOND:
				/* VLAN and Bond get promiscuous packets */
				break;
			default:
				m_freem(m);
				goto next;
			}
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same proto as previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6267 
6268 static errno_t
if_mcasts_update_common(struct ifnet * ifp,bool sync)6269 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6270 {
6271 	errno_t err;
6272 
6273 	if (sync) {
6274 		err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6275 		if (err == EAFNOSUPPORT) {
6276 			err = 0;
6277 		}
6278 	} else {
6279 		ifnet_ioctl_async(ifp, SIOCADDMULTI);
6280 		err = 0;
6281 	}
6282 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6283 	    "(err=%d)\n", if_name(ifp),
6284 	    (err == 0 ? "successfully restored" : "failed to restore"),
6285 	    ifp->if_updatemcasts, err);
6286 
6287 	/* just return success */
6288 	return 0;
6289 }
6290 
6291 static errno_t
if_mcasts_update_async(struct ifnet * ifp)6292 if_mcasts_update_async(struct ifnet *ifp)
6293 {
6294 	return if_mcasts_update_common(ifp, false);
6295 }
6296 
6297 errno_t
if_mcasts_update(struct ifnet * ifp)6298 if_mcasts_update(struct ifnet *ifp)
6299 {
6300 	return if_mcasts_update_common(ifp, true);
6301 }
6302 
6303 /* If ifp is set, we will increment the generation for the interface */
6304 int
dlil_post_complete_msg(struct ifnet * ifp,struct kev_msg * event)6305 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6306 {
6307 	if (ifp != NULL) {
6308 		ifnet_increment_generation(ifp);
6309 	}
6310 
6311 #if NECP
6312 	necp_update_all_clients();
6313 #endif /* NECP */
6314 
6315 	return kev_post_msg(event);
6316 }
6317 
6318 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6319 dlil_post_sifflags_msg(struct ifnet * ifp)
6320 {
6321 	struct kev_msg ev_msg;
6322 	struct net_event_data ev_data;
6323 
6324 	bzero(&ev_data, sizeof(ev_data));
6325 	bzero(&ev_msg, sizeof(ev_msg));
6326 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6327 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6328 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6329 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6330 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6331 	ev_data.if_family = ifp->if_family;
6332 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6333 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6334 	ev_msg.dv[0].data_ptr = &ev_data;
6335 	ev_msg.dv[1].data_length = 0;
6336 	dlil_post_complete_msg(ifp, &ev_msg);
6337 }
6338 
6339 #define TMP_IF_PROTO_ARR_SIZE   10
/*
 * dlil_event_internal
 *
 * Fan a kernel event out to, in order: every attached interface
 * filter, every attached protocol's event handler, and the
 * interface's own if_event callback; finally post the event message
 * (bumping the interface generation when update_generation is set).
 *
 * Protocol handlers are collected into a temporary ref-held array so
 * they can be invoked without holding the ifnet lock; the stack array
 * is used for the common small case, a heap array otherwise.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock across the callback; busy flag pins the list */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		/* not attached: skip protocols and if_event, just post */
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			/* too many protocols for the stack array; allocate */
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* snapshot all attached protocols, taking a ref on each */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* deliver the event to each protocol, lock-free */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6439 
6440 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6441 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6442 {
6443 	struct kev_msg kev_msg;
6444 	int result = 0;
6445 
6446 	if (ifp == NULL || event == NULL) {
6447 		return EINVAL;
6448 	}
6449 
6450 	bzero(&kev_msg, sizeof(kev_msg));
6451 	kev_msg.vendor_code = event->vendor_code;
6452 	kev_msg.kev_class = event->kev_class;
6453 	kev_msg.kev_subclass = event->kev_subclass;
6454 	kev_msg.event_code = event->event_code;
6455 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6456 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6457 	kev_msg.dv[1].data_length = 0;
6458 
6459 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6460 
6461 	return result;
6462 }
6463 
6464 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6465 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6466 {
6467 	mbuf_t  n = m;
6468 	int chainlen = 0;
6469 
6470 	while (n != NULL) {
6471 		chainlen++;
6472 		n = n->m_next;
6473 	}
6474 	switch (chainlen) {
6475 	case 0:
6476 		break;
6477 	case 1:
6478 		os_atomic_inc(&cls->cls_one, relaxed);
6479 		break;
6480 	case 2:
6481 		os_atomic_inc(&cls->cls_two, relaxed);
6482 		break;
6483 	case 3:
6484 		os_atomic_inc(&cls->cls_three, relaxed);
6485 		break;
6486 	case 4:
6487 		os_atomic_inc(&cls->cls_four, relaxed);
6488 		break;
6489 	case 5:
6490 	default:
6491 		os_atomic_inc(&cls->cls_five_or_more, relaxed);
6492 		break;
6493 	}
6494 }
6495 
#if CONFIG_DTRACE
/*
 * Fire the DTrace ip:::send probe for an outbound IPv4 or IPv6 packet.
 * Marked noinline, presumably to keep the probe setup out of the
 * dlil_output() hot path — confirm before relying on this.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t  m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		/* IPv4: v4 header supplied, v6 header slot is NULL */
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		/* IPv6: v6 header supplied, v4 header slot is NULL */
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
#endif /* CONFIG_DTRACE */
6514 
/*
 * dlil_output
 *
 * Caller should have a lock on the protocol domain if the protocol
 * doesn't support finer grained locking. In most cases, the lock
 * will be held from the socket layer and won't be released until
 * we return back to the socket layer.
 *
 * This does mean that we must take a protocol lock before we take
 * an interface lock if we're going to take both. This makes sense
 * because a protocol is likely to interact with an ifp while it
 * is under the protocol lock.
 *
 * An advisory code will be returned if adv is not null. This
 * can be used to provide feedback about interface queues to the
 * application.
 *
 * Parameters:
 *   ifp          interface to transmit on
 *   proto_family protocol family of the packets (e.g. PF_INET)
 *   packetlist   chain of packets linked via m_nextpkt
 *   route        opaque route hint passed to the pre-output routine
 *   dest         destination address for framing/resolution
 *   flags        DLIL_OUTPUT_FLAGS_* (NONE, RAW, SKIP_IF_FILTERS)
 *   adv          optional flow-advisory result (may be NULL)
 */
errno_t
dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
    void *route, const struct sockaddr *dest, int flags, struct flowadv *adv)
{
	char *frame_type = NULL;
	char *dst_linkaddr = NULL;
	int retval = 0;
	char frame_type_buffer[DLIL_MAX_FRAME_TYPE_BUFFER_SIZE];
	char dst_linkaddr_buffer[DLIL_MAX_LINKADDR_BUFFER_SIZE];
	struct if_proto *proto = NULL;
	mbuf_t  m = NULL;
	mbuf_t  send_head = NULL;
	mbuf_t  *send_tail = &send_head;	/* batching tail pointer */
	int iorefcnt = 0;	/* 1 once ifnet_datamov_begin() succeeds */
	u_int32_t pre = 0, post = 0;	/* bytes framer prepends/appends */
	u_int32_t fpkts = 0, fbytes = 0;	/* forwarded pkt/byte counters */
	int32_t flen = 0;
	struct timespec now;
	u_int64_t now_nsec;
	boolean_t did_clat46 = FALSE;
	protocol_family_t old_proto_family = proto_family;
	struct sockaddr_in6 dest6;
	struct rtentry *rt = NULL;	/* CLAT46 route ref, freed in cleanup */
	u_int16_t m_loop_set = 0;
	bool raw = (flags & DLIL_OUTPUT_FLAGS_RAW) != 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
	 * from happening while this operation is in progress
	 */
	if (!ifnet_datamov_begin(ifp)) {
		retval = ENXIO;
		goto cleanup;
	}
	iorefcnt = 1;

	VERIFY(ifp->if_output_dlil != NULL);

	/* update the driver's multicast filter, if needed */
	if (ifp->if_updatemcasts > 0) {
		if_mcasts_update_async(ifp);
		ifp->if_updatemcasts = 0;
	}

	frame_type = frame_type_buffer;
	dst_linkaddr = dst_linkaddr_buffer;

	if (flags == DLIL_OUTPUT_FLAGS_NONE) {
		ifnet_lock_shared(ifp);
		/* callee holds a proto refcnt upon success */
		proto = find_attached_proto(ifp, proto_family);
		if (proto == NULL) {
			ifnet_lock_done(ifp);
			retval = ENXIO;
			goto cleanup;
		}
		ifnet_lock_done(ifp);
	}

preout_again:
	/* dequeue the next packet from the caller's chain */
	if (packetlist == NULL) {
		goto cleanup;
	}

	m = packetlist;
	packetlist = packetlist->m_nextpkt;
	m->m_nextpkt = NULL;

	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);

	/*
	 * Perform address family translation for the first
	 * packet outside the loop in order to perform address
	 * lookup for the translated proto family.
	 */
	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
	    (ifp->if_type == IFT_CELLULAR ||
	    dlil_is_clat_needed(proto_family, m))) {
		retval = dlil_clat46(ifp, &proto_family, &m);
		/*
		 * Go to the next packet if translation fails
		 */
		if (retval != 0) {
			m_freem(m);
			m = NULL;
			ip6stat.ip6s_clat464_out_drop++;
			/* Make sure that the proto family is PF_INET */
			ASSERT(proto_family == PF_INET);
			goto preout_again;
		}
		/*
		 * Free the old one and make it point to the IPv6 proto structure.
		 *
		 * Change proto for the first time we have successfully
		 * performed address family translation.
		 */
		if (!did_clat46 && proto_family == PF_INET6) {
			did_clat46 = TRUE;

			if (proto != NULL) {
				if_proto_free(proto);
			}
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			proto = find_attached_proto(ifp, proto_family);
			if (proto == NULL) {
				ifnet_lock_done(ifp);
				retval = ENXIO;
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
			ifnet_lock_done(ifp);
			if (ifp->if_type == IFT_ETHER) {
				/* Update the dest to translated v6 address */
				dest6.sin6_len = sizeof(struct sockaddr_in6);
				dest6.sin6_family = AF_INET6;
				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
				dest = SA(&dest6);

				/*
				 * Lookup route to the translated destination
				 * Free this route ref during cleanup
				 */
				rt = rtalloc1_scoped(SA(&dest6),
				    0, 0, ifp->if_index);

				route = rt;
			}
		}
	}

	/*
	 * This path gets packet chain going to the same destination.
	 * The pre output routine is used to either trigger resolution of
	 * the next hop or retrieve the next hop's link layer addressing.
	 * For ex: ether_inet(6)_pre_output routine.
	 *
	 * If the routine returns EJUSTRETURN, it implies that packet has
	 * been queued, and therefore we have to call preout_again for the
	 * following packet in the chain.
	 *
	 * For errors other than EJUSTRETURN, the current packet is freed
	 * and the rest of the chain (pointed by packetlist is freed as
	 * part of clean up.
	 *
	 * Else if there is no error the retrieved information is used for
	 * all the packets in the chain.
	 */
	if (flags == DLIL_OUTPUT_FLAGS_NONE) {
		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
		retval = 0;
		if (preoutp != NULL) {
			retval = preoutp(ifp, proto_family, &m, dest, route,
			    frame_type, dst_linkaddr);

			if (retval != 0) {
				if (retval == EJUSTRETURN) {
					goto preout_again;
				}
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
		}
	}

	nanouptime(&now);
	net_timernsec(&now, &now_nsec);

	/* main per-packet loop over the chain */
	do {
		/*
		 * pkt_hdr is set here to point to m_data prior to
		 * calling into the framer. This value of pkt_hdr is
		 * used by the netif gso logic to retrieve the ip header
		 * for the TCP packets, offloaded for TSO processing.
		 */
		if (raw && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
			uint8_t vlan_encap_len = 0;

			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
			}
			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
		} else {
			m->m_pkthdr.pkt_hdr = mtod(m, void *);
		}

		/*
		 * Perform address family translation if needed.
		 * For now we only support stateless 4 to 6 translation
		 * on the out path.
		 *
		 * The routine below translates IP header, updates protocol
		 * checksum and also translates ICMP.
		 *
		 * We skip the first packet as it is already translated and
		 * the proto family is set to PF_INET6.
		 */
		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    (ifp->if_type == IFT_CELLULAR ||
		    dlil_is_clat_needed(proto_family, m))) {
			retval = dlil_clat46(ifp, &proto_family, &m);
			/* Goto the next packet if the translation fails */
			if (retval != 0) {
				m_freem(m);
				m = NULL;
				ip6stat.ip6s_clat464_out_drop++;
				goto next;
			}
		}

#if CONFIG_DTRACE
		if (flags == DLIL_OUTPUT_FLAGS_NONE) {
			dlil_output_dtrace(ifp, proto_family, m);
		}
#endif /* CONFIG_DTRACE */

		if (flags == DLIL_OUTPUT_FLAGS_NONE && ifp->if_framer != NULL) {
			int rcvif_set = 0;

			/*
			 * If this is a broadcast packet that needs to be
			 * looped back into the system, set the inbound ifp
			 * to that of the outbound ifp.  This will allow
			 * us to determine that it is a legitimate packet
			 * for the system.  Only set the ifp if it's not
			 * already set, just to be safe.
			 */
			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
			    m->m_pkthdr.rcvif == NULL) {
				m->m_pkthdr.rcvif = ifp;
				rcvif_set = 1;
			}
			/* remember M_LOOP so it can be restored on the next packet */
			m_loop_set = m->m_flags & M_LOOP;
			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
			    frame_type, &pre, &post);
			if (retval != 0) {
				if (retval != EJUSTRETURN) {
					m_freem(m);
				}
				goto next;
			}

			/*
			 * For partial checksum offload, adjust the start
			 * and stuff offsets based on the prepended header.
			 */
			if ((m->m_pkthdr.csum_flags &
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
				m->m_pkthdr.csum_tx_stuff += pre;
				m->m_pkthdr.csum_tx_start += pre;
			}

			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
				dlil_output_cksum_dbg(ifp, m, pre,
				    proto_family);
			}

			/*
			 * Clear the ifp if it was set above, and to be
			 * safe, only if it is still the same as the
			 * outbound ifp we have in context.  If it was
			 * looped back, then a copy of it was sent to the
			 * loopback interface with the rcvif set, and we
			 * are clearing the one that will go down to the
			 * layer below.
			 */
			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
				m->m_pkthdr.rcvif = NULL;
			}
		}

		/*
		 * Let interface filters (if any) do their thing ...
		 */
		if ((flags & DLIL_OUTPUT_FLAGS_SKIP_IF_FILTERS) == 0) {
			retval = dlil_interface_filters_output(ifp, &m, proto_family);
			if (retval != 0) {
				if (retval != EJUSTRETURN) {
					m_freem(m);
				}
				goto next;
			}
		}
		/*
		 * Strip away M_PROTO1 bit prior to sending packet
		 * to the driver as this field may be used by the driver
		 */
		m->m_flags &= ~M_PROTO1;

		/*
		 * If the underlying interface is not capable of handling a
		 * packet whose data portion spans across physically disjoint
		 * pages, we need to "normalize" the packet so that we pass
		 * down a chain of mbufs where each mbuf points to a span that
		 * resides in the system page boundary.  If the packet does
		 * not cross page(s), the following is a no-op.
		 */
		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
			if ((m = m_normalize(m)) == NULL) {
				goto next;
			}
		}

		/*
		 * If this is a TSO packet, make sure the interface still
		 * advertise TSO capability.
		 */
		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
			retval = EMSGSIZE;
			m_freem(m);
			goto cleanup;
		}

		ifp_inc_traffic_class_out(ifp, m);

#if SKYWALK
		/*
		 * For native skywalk devices, packets will be passed to pktap
		 * after GSO or after the mbuf to packet conversion.
		 * This is done for IPv4/IPv6 packets only because there is no
		 * space in the mbuf to pass down the proto family.
		 */
		if (dlil_is_native_netif_nexus(ifp)) {
			if (raw || m->m_pkthdr.pkt_proto == 0) {
				pktap_output(ifp, proto_family, m, pre, post);
				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
			}
		} else {
			pktap_output(ifp, proto_family, m, pre, post);
		}
#else /* SKYWALK */
		pktap_output(ifp, proto_family, m, pre, post);
#endif /* SKYWALK */

		/*
		 * Count the number of elements in the mbuf chain
		 */
		if (tx_chain_len_count) {
			dlil_count_chain_len(m, &tx_chain_len_stats);
		}

		/*
		 * Discard partial sum information if this packet originated
		 * from another interface; the packet would already have the
		 * final checksum and we shouldn't recompute it.
		 */
		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
			m->m_pkthdr.csum_data = 0;
		}

		/*
		 * Finally, call the driver.
		 */
		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
			/* batch the packet; the chain is flushed after the loop */
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen += (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			}
			(void) mbuf_set_timestamp(m, now_nsec, TRUE);

			*send_tail = m;
			send_tail = &m->m_nextpkt;
		} else {
			/*
			 * Record timestamp; ifnet_enqueue() will use this info
			 * rather than redoing the work.
			 */
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			(void) mbuf_set_timestamp(m, now_nsec, TRUE);

			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen = (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			} else {
				flen = 0;
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
			    0, 0, 0, 0, 0);
			retval = (*ifp->if_output_dlil)(ifp, m);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				/* translate queue state into a flow advisory */
				if (adv != NULL && adv->code == FADV_SUCCESS) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp),
				    retval);
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
			    0, 0, 0, 0, 0);
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

next:
		/* advance to the next packet in the caller's chain */
		m = packetlist;
		if (m != NULL) {
			m->m_flags |= m_loop_set;
			packetlist = packetlist->m_nextpkt;
			m->m_nextpkt = NULL;
		}
		/* Reset the proto family to old proto family for CLAT */
		if (did_clat46) {
			proto_family = old_proto_family;
		}
	} while (m != NULL);

	/* flush packets batched above for SENDLIST / ENQUEUE_MULTI drivers */
	if (send_head != NULL) {
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
		    0, 0, 0, 0, 0);
		if (ifp->if_eflags & IFEF_SENDLIST) {
			/* driver accepts a whole chain in one call */
			retval = (*ifp->if_output_dlil)(ifp, send_head);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				if (adv != NULL) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp), retval);
			}
		} else {
			/* enqueue one packet at a time, then kick the starter */
			struct mbuf *send_m;
			int enq_cnt = 0;
			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
			while (send_head != NULL) {
				send_m = send_head;
				send_head = send_m->m_nextpkt;
				send_m->m_nextpkt = NULL;
				retval = (*ifp->if_output_dlil)(ifp, send_m);
				if (retval == EQFULL || retval == EQSUSPENDED) {
					if (adv != NULL) {
						adv->code = (retval == EQFULL ?
						    FADV_FLOW_CONTROLLED :
						    FADV_SUSPENDED);
					}
					retval = 0;
				}
				if (retval == 0) {
					enq_cnt++;
					if (flen > 0) {
						fpkts++;
					}
				}
				if (retval != 0 && dlil_verbose) {
					DLIL_PRINTF("%s: output error on %s "
					    "retval = %d\n",
					    __func__, if_name(ifp), retval);
				}
			}
			if (enq_cnt > 0) {
				fbytes += flen;
				ifnet_start(ifp);
			}
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

cleanup:
	/* common exit: update stats, drop refs, free leftovers */
	if (fbytes > 0) {
		ifp->if_fbytes += fbytes;
	}
	if (fpkts > 0) {
		ifp->if_fpackets += fpkts;
	}
	if (proto != NULL) {
		if_proto_free(proto);
	}
	if (packetlist) { /* if any packets are left, clean up */
		mbuf_freem_list(packetlist);
	}
	if (retval == EJUSTRETURN) {
		retval = 0;
	}
	if (iorefcnt == 1) {
		ifnet_datamov_end(ifp);
	}
	if (rt != NULL) {
		rtfree(rt);
		rt = NULL;
	}

	return retval;
}
7042 
7043 /*
7044  * This routine checks if the destination address is not a loopback, link-local,
7045  * multicast or broadcast address.
7046  */
7047 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7048 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7049 {
7050 	int ret = 0;
7051 	switch (proto_family) {
7052 	case PF_INET: {
7053 		struct ip *iph = mtod(m, struct ip *);
7054 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7055 			ret = 1;
7056 		}
7057 		break;
7058 	}
7059 	case PF_INET6: {
7060 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7061 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7062 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
7063 			ret = 1;
7064 		}
7065 		break;
7066 	}
7067 	}
7068 
7069 	return ret;
7070 }
/*
 * @brief This routine translates IPv4 packet to IPv6 packet,
 *     updates protocol checksum and also translates ICMP for code
 *     along with inner header translation.
 *
 * @param ifp Pointer to the interface
 * @param proto_family pointer to protocol family. It is updated if function
 *     performs the translation successfully.
 * @param m Pointer to the pointer pointing to the packet. Needed because this
 *     routine can end up changing the mbuf to a different one.
 *
 * @return 0 on success or else a negative value.
 */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;	/* original v4 src/dst */
	uint8_t proto = 0;
	struct in6_addr src_storage = {};
	struct in6_addr *src = NULL;
	struct sockaddr_in6 dstsock = {};
	int error = 0;
	uint16_t off = 0;	/* v4 header length in bytes */
	uint16_t tot_len = 0;
	uint16_t ip_id_val = 0;
	uint16_t ip_frag_off = 0;

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/* wrap the mbuf in a pbuf; the translation routines operate on pbufs */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* capture v4 header fields before the header is rewritten */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dstsock.sin6_addr)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	dstsock.sin6_len = sizeof(struct sockaddr_in6);
	dstsock.sin6_family = AF_INET6;

	/*
	 * Retrive the local IPv6 CLAT46 address reserved for stateless
	 * translation.
	 */
	src = in6_selectsrc_core(&dstsock, 0, ifp, 0, &src_storage, NULL, &error,
	    NULL, NULL, TRUE);

	if (src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}


	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, src_storage, dstsock.sin6_addr, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL;     /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	/* hand the (possibly replaced) mbuf back to the caller */
	if (pbuf_is_valid(pbuf)) {
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		/* pbuf consumed the mbuf; caller must not touch *m */
		error = -1;
		*m = NULL;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	if (error == 0) {
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7217 
/*
 * @brief This routine translates incoming IPv6 to IPv4 packet,
 *     updates protocol checksum and also translates ICMPv6 outer
 *     and inner headers
 *
 * @param ifp Pointer to the interface
 * @param proto_family pointer to protocol family. It is updated to PF_INET
 *     if the function performs the translation successfully.
 * @param m Pointer to the pointer pointing to the packet. Needed because this
 *     routine can end up changing the mbuf to a different one.
 *
 * @return 0 on success or else a negative value.
 */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;	/* original v6 src/dst */
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_dst = NULL;
	struct in_ifaddr *ia4_clat_dst = NULL;
	struct in_addr *dst = NULL;
	struct in_addr src;
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen  */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrive the local CLAT46 IPv4 address reserved for stateless
		 * translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			ifa_remref(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		/* done with the v6 address; drop its reference */
		ifa_remref(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		/* capture v6 header fields before the header is rewritten */
		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		if (ia4_clat_dst != NULL) {
			ifa_remref(&ia4_clat_dst->ia_ifa);
		}

		/* hand the (possibly replaced) mbuf back to the caller */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		if (error == 0) {
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7359 
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/* Deferred ioctl request: which interface and which ioctl to issue */
struct ifnet_ioctl_event {
	struct ifnet *ifp;
	u_long ioctl_code;
};

/*
 * Work-queue entry carrying an ifnet_ioctl_event; the callback recovers
 * the enclosing structure from the embedded nwk_wqe via __container_of.
 */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7372 
/*
 * Schedule an ifnet ioctl (SIOCADDMULTI or SIOCDELMULTI only) to run
 * asynchronously on the network work queue.  Duplicate requests for the
 * same ioctl are coalesced via the if_mcast_{add,del}_signaled flags:
 * only one instance of each may be pending at a time.  The io reference
 * taken here is released by the work-queue callback (or on early return).
 */
void
ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
{
	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
	bool compare_expected;

	/*
	 * Get an io ref count if the interface is attached.
	 * At this point it most likely is. We are taking a reference for
	 * deferred processing.
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
		    "is not attached",
		    __func__, __LINE__, if_name(ifp), ioctl_code);
		return;
	}
	switch (ioctl_code) {
	case SIOCADDMULTI:
		compare_expected = false;
		/* already signaled: coalesce and drop the extra io ref */
		if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
			ifnet_decr_iorefcnt(ifp);
			return;
		}
		break;
	case SIOCDELMULTI:
		compare_expected = false;
		/* already signaled: coalesce and drop the extra io ref */
		if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
			ifnet_decr_iorefcnt(ifp);
			return;
		}
		break;
	default:
		os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
		    __func__, __LINE__, if_name(ifp), ioctl_code);
		return;
	}

	/* Z_NOFAIL: allocation cannot return NULL, no error path needed */
	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
}
7419 
/*
 * Work-queue callback for ifnet_ioctl_async(): clears the coalescing flag
 * for the request kind, performs the ioctl, then releases the io refcount
 * taken by the enqueuer and frees the work item.
 */
static void
ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
{
	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);

	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
	int ret = 0;

	/* Clear the signal first so a new request can be queued while we run */
	switch (ioctl_code) {
	case SIOCADDMULTI:
		atomic_store(&ifp->if_mcast_add_signaled, false);
		break;
	case SIOCDELMULTI:
		atomic_store(&ifp->if_mcast_del_signaled, false);
		break;
	}
	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
	} else if (dlil_verbose) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
		    "for ioctl %lu",
		    __func__, __LINE__, if_name(ifp), ioctl_code);
	}
	/* Drop the io ref taken by ifnet_ioctl_async() on our behalf */
	ifnet_decr_iorefcnt(ifp);
	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
	return;
}
7450 
/*
 * Dispatch an ioctl against an interface.  The request is offered, in
 * order, to: (1) every attached interface filter, (2) the protocol
 * attached for proto_fam (when non-zero), and (3) the driver's if_ioctl.
 * The first result other than 0/EOPNOTSUPP stops the chain; EJUSTRETURN
 * also stops it and is reported to the caller as success (0).
 * Holds an io refcount on ifp for the duration of the call.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;        /* sentinel: "nobody handled it yet" */
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/*
			 * Drop the lock across the callout; the monitor-busy
			 * state keeps the filter list stable meanwhile.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled, stop processing": report success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7568 
7569 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7570 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7571 {
7572 	errno_t error = 0;
7573 
7574 	if (ifp->if_set_bpf_tap) {
7575 		/* Get an io reference on the interface if it is attached */
7576 		if (!ifnet_is_attached(ifp, 1)) {
7577 			return ENXIO;
7578 		}
7579 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7580 		ifnet_decr_iorefcnt(ifp);
7581 	}
7582 	return error;
7583 }
7584 
7585 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7586 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7587     struct sockaddr *ll_addr, size_t ll_len)
7588 {
7589 	errno_t result = EOPNOTSUPP;
7590 	struct if_proto *proto;
7591 	const struct sockaddr *verify;
7592 	proto_media_resolve_multi resolvep;
7593 
7594 	if (!ifnet_is_attached(ifp, 1)) {
7595 		return result;
7596 	}
7597 
7598 	bzero(ll_addr, ll_len);
7599 
7600 	/* Call the protocol first; callee holds a proto refcnt upon success */
7601 	ifnet_lock_shared(ifp);
7602 	proto = find_attached_proto(ifp, proto_addr->sa_family);
7603 	ifnet_lock_done(ifp);
7604 	if (proto != NULL) {
7605 		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7606 		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7607 		if (resolvep != NULL) {
7608 			result = resolvep(ifp, proto_addr, SDL(ll_addr), ll_len);
7609 		}
7610 		if_proto_free(proto);
7611 	}
7612 
7613 	/* Let the interface verify the multicast address */
7614 	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7615 		if (result == 0) {
7616 			verify = ll_addr;
7617 		} else {
7618 			verify = proto_addr;
7619 		}
7620 		result = ifp->if_check_multi(ifp, verify);
7621 	}
7622 
7623 	ifnet_decr_iorefcnt(ifp);
7624 	return result;
7625 }
7626 
7627 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7628 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7629     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7630     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7631 {
7632 	struct if_proto *proto;
7633 	errno_t result = 0;
7634 
7635 	if ((ifp->if_flags & IFF_NOARP) != 0) {
7636 		result = ENOTSUP;
7637 		goto done;
7638 	}
7639 
7640 	/* callee holds a proto refcnt upon success */
7641 	ifnet_lock_shared(ifp);
7642 	proto = find_attached_proto(ifp, target_proto->sa_family);
7643 	ifnet_lock_done(ifp);
7644 	if (proto == NULL) {
7645 		result = ENOTSUP;
7646 	} else {
7647 		proto_media_send_arp    arpp;
7648 		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7649 		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7650 		if (arpp == NULL) {
7651 			result = ENOTSUP;
7652 		} else {
7653 			switch (arpop) {
7654 			case ARPOP_REQUEST:
7655 				arpstat.txrequests++;
7656 				if (target_hw != NULL) {
7657 					arpstat.txurequests++;
7658 				}
7659 				break;
7660 			case ARPOP_REPLY:
7661 				arpstat.txreplies++;
7662 				break;
7663 			}
7664 			result = arpp(ifp, arpop, sender_hw, sender_proto,
7665 			    target_hw, target_proto);
7666 		}
7667 		if_proto_free(proto);
7668 	}
7669 done:
7670 	return result;
7671 }
7672 
/*
 * Opaque token type for the thread-mark push/pop API below.  Tokens are
 * never dereferenced: a mark bitmask is encoded as a pointer offset from
 * the address of net_thread_marks_base.
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* Token meaning "no marks were pushed" (offset 0 from the base). */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7678 
/*
 * Set the bits in `push' on the current thread's uu_network_marks.
 * Returns a token encoding exactly the bits that were newly set (the bits
 * that were not already marked), expressed as an offset from the static
 * base object; passing it to net_thread_marks_pop() clears only those
 * bits, making nested push/pop pairs compose correctly.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* only the bits not already set become "ours" to pop later */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	/* encode the bitmask as a pointer offset from base */
	return (net_thread_marks_t)&base[pop];
}
7696 
/*
 * Inverse of net_thread_marks_push(): temporarily CLEAR the bits in
 * `unpush' on the current thread's uu_network_marks.  Returns a token
 * encoding the bits that were actually cleared (those previously set),
 * for restoration via net_thread_unmarks_pop().
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* only bits that were set get cleared and recorded */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	/* encode the cleared bitmask as a pointer offset from base */
	return (net_thread_marks_t)&base[unpop];
}
7714 
/*
 * Clear the mark bits recorded in a token from net_thread_marks_push().
 * The bitmask is recovered as the token's pointer offset from the base
 * object; the VERIFYs check the offset fits in 32 bits and that every
 * bit being popped is currently set on the thread.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7730 
/*
 * Restore the mark bits recorded in a token from net_thread_unmarks_push()
 * (i.e. re-set the bits that were temporarily cleared).  The VERIFYs
 * check the offset fits in 32 bits and that none of the bits being
 * restored are currently set.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= (u_int32_t)unpop;
	}
}
7746 
7747 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7748 net_thread_is_marked(u_int32_t check)
7749 {
7750 	if (check != 0) {
7751 		struct uthread *uth = current_uthread();
7752 		return uth->uu_network_marks & check;
7753 	} else {
7754 		return 0;
7755 	}
7756 }
7757 
7758 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7759 net_thread_is_unmarked(u_int32_t check)
7760 {
7761 	if (check != 0) {
7762 		struct uthread *uth = current_uthread();
7763 		return ~uth->uu_network_marks & check;
7764 	} else {
7765 		return 0;
7766 	}
7767 }
7768 
/*
 * An ARP announcement (gratuitous ARP) carries the same IPv4 address as
 * both sender and target.  Returns non-zero when the two addresses match;
 * 0 when they differ or either pointer is NULL.
 */
static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,
    const struct sockaddr_in * target_sin)
{
	if (sender_sin == NULL || target_sin == NULL) {
		return 0;
	}

	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
}
7779 
/*
 * Send an ARP packet.  Normally forwards to dlil_send_arp_internal() on
 * the given interface.  Special cases:
 *  - If the target is a router (RTF_ROUTER), the target sockaddr is
 *    copied into a sockaddr_inarp with SIN_ROUTER set so the protocol's
 *    send_arp callback can tell.
 *  - An ARP *request* for an IPv4 link-local target (that is not an
 *    announcement) is fanned out to every attached interface flagged
 *    IFEF_ARPLL, using each interface's own hardware and IPv4 source
 *    addresses.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = __DECONST_SA(target_proto0);

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		SOCKADDR_COPY(target_proto, &target_proto_sinarp, sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = SA(&target_proto_sinarp);
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = SIN(sender_proto);
	target_sin = SIN(target_proto);
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *__counted_by(count) ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						SOCKADDR_COPY(SIN(source_ip->ifa_addr), &source_ip_copy, sizeof(source_ip_copy));
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* Keep the lladdr alive past the unlock */
				ifa_addref(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, SDL(source_hw->ifa_addr),
				    SA(&source_ip_copy), NULL,
				    target_proto);

				ifa_remref(source_hw);
				/* Report the first definitive result */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free_counted_by(ifp_list, count);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7890 
7891 /*
7892  * Caller must hold ifnet head lock.
7893  */
7894 static int
ifnet_lookup(struct ifnet * ifp)7895 ifnet_lookup(struct ifnet *ifp)
7896 {
7897 	struct ifnet *_ifp;
7898 
7899 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7900 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7901 		if (_ifp == ifp) {
7902 			break;
7903 		}
7904 	}
7905 	return _ifp != NULL;
7906 }
7907 
7908 /*
7909  * Caller has to pass a non-zero refio argument to get a
7910  * IO reference count. This will prevent ifnet_detach from
7911  * being called when there are outstanding io reference counts.
7912  */
7913 int
ifnet_is_attached(struct ifnet * ifp,int refio)7914 ifnet_is_attached(struct ifnet *ifp, int refio)
7915 {
7916 	int ret;
7917 
7918 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7919 	if ((ret = IF_FULLY_ATTACHED(ifp))) {
7920 		if (refio > 0) {
7921 			ifp->if_refio++;
7922 		}
7923 	}
7924 	lck_mtx_unlock(&ifp->if_ref_lock);
7925 
7926 	return ret;
7927 }
7928 
/*
 * Account for a kernel thread that has been created for this interface
 * but has not started yet; paired with ifnet_decr_pending_thread_count().
 */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7936 
/*
 * Drop the pending-thread count taken by ifnet_incr_pending_thread_count();
 * wakes up any waiter sleeping on if_threads_pending when it hits zero.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7948 
/*
 * Caller must ensure the interface is attached; the assumption is that
 * there is at least an outstanding IO reference count held already.
 * Most callers would call ifnet_is_attached(attached,data_ready)() instead.
 */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/* both VERIFYs enforce the "already referenced" precondition above */
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7963 
/*
 * Drop one io reference with if_ref_lock already held.  When the count
 * reaches zero and a detach is pending, wake the ifnet_detach thread
 * sleeping on if_refio.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* datamov holders each hold an io ref, so refio==0 implies datamov==0 */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
7984 
/*
 * Locking wrapper around ifnet_decr_iorefcnt_locked(); releases one io
 * reference taken via ifnet_is_attached(ifp, 1) or ifnet_incr_iorefcnt().
 */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7992 
/*
 * Enter the data-movement path on an interface.  Succeeds only when the
 * interface is fully attached AND ready (not suspended); on success it
 * takes both an io reference and a datamov count, released by
 * ifnet_datamov_end().  Returns FALSE without side effects otherwise.
 */
boolean_t
ifnet_datamov_begin(struct ifnet *ifp)
{
	boolean_t ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
		ifp->if_refio++;
		ifp->if_datamov++;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	DTRACE_IP2(datamov__begin, struct ifnet *, ifp, boolean_t, ret);
	return ret;
}
8008 
/*
 * Leave the data-movement path: drop the datamov count and the io
 * reference taken by ifnet_datamov_begin().  The last leaver wakes any
 * drainer blocked in ifnet_datamov_drain().
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);

	DTRACE_IP1(datamov__end, struct ifnet *, ifp);
}
8028 
/*
 * Suspend data movement with if_ref_lock held: take an io reference (so
 * detach waits for the matching resume) and, on the first suspension,
 * clear IFRF_READY so ifnet_datamov_begin() starts failing.
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8039 
/*
 * Unconditionally suspend data movement on an interface (nests: each
 * call must be balanced by ifnet_datamov_resume()).
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8048 
/*
 * Suspend data movement only if no suspension is already in effect.
 * Returns TRUE when this call performed the suspension (caller must
 * eventually call ifnet_datamov_resume()), FALSE when already suspended.
 */
boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	if (ifp->if_suspend > 0) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		return FALSE;
	}
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
	return TRUE;
}
8062 
/*
 * Block until every in-flight data-movement thread has left the
 * interface (if_datamov reaches 0), then flush the transmit queues.
 * Data movement must already be suspended by the caller, which
 * guarantees no new threads can enter while we wait.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		/* woken by the last ifnet_datamov_end() */
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8090 
/*
 * Convenience: suspend data movement, then wait for in-flight data
 * threads to drain.  Balance with ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8097 
/*
 * Undo one ifnet_datamov_suspend(): the last resume re-sets IFRF_READY so
 * data movement can begin again, and the io reference taken at suspend
 * time is released.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8111 
8112 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8113 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8114 {
8115 	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8116 	ctrace_t *tr;
8117 	u_int32_t idx;
8118 	u_int16_t *cnt;
8119 
8120 	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8121 		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8122 		/* NOTREACHED */
8123 	}
8124 
8125 	if (refhold) {
8126 		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8127 		tr = dl_if_dbg->dldbg_if_refhold;
8128 	} else {
8129 		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8130 		tr = dl_if_dbg->dldbg_if_refrele;
8131 	}
8132 
8133 	idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8134 	ctrace_record(&tr[idx]);
8135 }
8136 
/*
 * Take a reference on the dlil_ifnet wrapper of an interface.  Panics on
 * refcount wraparound; records the event when debug tracing is enabled.
 * Returns EINVAL if ifp is NULL, otherwise 0.
 */
errno_t
dlil_if_ref(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	++dl_if->dl_if_refcnt;
	if (dl_if->dl_if_refcnt == 0) {
		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
	}
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, TRUE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);

	return 0;
}
8159 
/*
 * Release a reference taken by dlil_if_ref().  Panics on underflow.
 * When the last reference is dropped on an interface that is still
 * embryonic (never fully attached), the interface storage itself is
 * released via _dlil_if_release() after the lock is dropped.
 */
errno_t
dlil_if_free(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	bool need_release = FALSE;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	switch (dl_if->dl_if_refcnt) {
	case 0:
		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
		break;
	case 1:
		/* dropping the last ref on an embryonic ifnet frees it below */
		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
			need_release = TRUE;
		}
		break;
	default:
		break;
	}
	--dl_if->dl_if_refcnt;
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, FALSE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);
	if (need_release) {
		_dlil_if_release(ifp, true);
	}
	return 0;
}
8194 
/*
 * Attach an if_proto (already populated by the v1/v2 KPI wrappers) to its
 * interface: register the demux descriptors with the family, insert the
 * proto into the per-family hash, and post KEV_DL_PROTO_ATTACHED.
 * Returns EEXIST if the family is already attached, EINVAL for non-bridge
 * protocols on vmnet interfaces, ENXIO if the interface is detached, or
 * the family module's error.  On success *proto_count (if non-NULL) is
 * set to the number of protocols now attached.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* io ref held until ioref_done; keeps detach from racing us */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8274 
/*
 * Post-attach housekeeping common to the v1 and v2 attach paths: bring
 * the interface up, notify the driver and listeners of the flag change,
 * and (Skywalk) attach the flowswitch nexus when IP is attached.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8298 
/*
 * Public KPI (v1): attach a protocol family to an interface.  Validates
 * arguments, checks the interface is on the global list (head lock held
 * across the attach so the ifp cannot disappear), copies the v1 callback
 * set into a freshly allocated if_proto, and delegates to
 * dlil_attach_protocol().  On success the interface is marked up; on
 * failure the if_proto allocation is released here.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed: give the allocation back to the zone */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8360 
8361 errno_t
ifnet_attach_protocol_v2(ifnet_t ifp,protocol_family_t protocol,const struct ifnet_attach_proto_param_v2 * proto_details)8362 ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
8363     const struct ifnet_attach_proto_param_v2 *proto_details)
8364 {
8365 	int retval = 0;
8366 	struct if_proto  *ifproto = NULL;
8367 	uint32_t proto_count = 0;
8368 
8369 	ifnet_head_lock_shared();
8370 	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
8371 		retval = EINVAL;
8372 		goto end;
8373 	}
8374 	/* Check that the interface is in the global list */
8375 	if (!ifnet_lookup(ifp)) {
8376 		retval = ENXIO;
8377 		goto end;
8378 	}
8379 
8380 	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8381 
8382 	/* refcnt held above during lookup */
8383 	ifproto->ifp = ifp;
8384 	ifproto->protocol_family = protocol;
8385 	ifproto->proto_kpi = kProtoKPI_v2;
8386 	ifproto->kpi.v2.input = proto_details->input;
8387 	ifproto->kpi.v2.pre_output = proto_details->pre_output;
8388 	ifproto->kpi.v2.event = proto_details->event;
8389 	ifproto->kpi.v2.ioctl = proto_details->ioctl;
8390 	ifproto->kpi.v2.detached = proto_details->detached;
8391 	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
8392 	ifproto->kpi.v2.send_arp = proto_details->send_arp;
8393 
8394 	retval = dlil_attach_protocol(ifproto,
8395 	    proto_details->demux_list, proto_details->demux_count,
8396 	    &proto_count);
8397 
8398 end:
8399 	if (retval == EEXIST) {
8400 		/* already attached */
8401 		if (dlil_verbose) {
8402 			DLIL_PRINTF("%s: protocol %d already attached\n",
8403 			    ifp != NULL ? if_name(ifp) : "N/A",
8404 			    protocol);
8405 		}
8406 	} else if (retval != 0) {
8407 		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
8408 		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
8409 	} else if (dlil_verbose) {
8410 		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
8411 		    ifp != NULL ? if_name(ifp) : "N/A",
8412 		    protocol, proto_count);
8413 	}
8414 	ifnet_head_done();
8415 	if (retval == 0) {
8416 		dlil_handle_proto_attach(ifp, protocol);
8417 	} else if (ifproto != NULL) {
8418 		zfree(dlif_proto_zone, ifproto);
8419 	}
8420 	return retval;
8421 }
8422 
/*
 * Detach a protocol previously attached to an interface.
 *
 * Removes the if_proto from the interface's protocol hash, notifies the
 * family module via if_del_proto, and replaces the protocol's callbacks
 * with the ifproto_media_* placeholders (which return ENXIO) so that any
 * concurrent caller still holding a protocol reference invokes harmless
 * stubs rather than the detached protocol's routines.  The final teardown
 * happens when the last protocol reference is dropped via if_proto_free().
 *
 * Returns 0 on success, EINVAL on bad arguments, or ENXIO if the
 * protocol is not attached to the interface.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	/* unlink from the per-interface protocol hash chain */
	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/*
	 * Neutralize the callbacks while still holding the ifnet lock
	 * exclusively; note that kpi.v1/v2 "detached" is left intact so
	 * the protocol can still be notified of the final detach.
	 */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8488 
8489 static errno_t
ifproto_media_input_v1(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet,char * header)8490 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8491     struct mbuf *packet, char *header)
8492 {
8493 #pragma unused(ifp, protocol, packet, header)
8494 	return ENXIO;
8495 }
8496 
8497 static errno_t
ifproto_media_input_v2(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet)8498 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8499     struct mbuf *packet)
8500 {
8501 #pragma unused(ifp, protocol, packet)
8502 	return ENXIO;
8503 }
8504 
8505 static errno_t
ifproto_media_preout(struct ifnet * ifp,protocol_family_t protocol,mbuf_t * packet,const struct sockaddr * dest,void * route,char * frame_type,char * link_layer_dest)8506 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8507     mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8508     char *link_layer_dest)
8509 {
8510 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8511 	return ENXIO;
8512 }
8513 
8514 static void
ifproto_media_event(struct ifnet * ifp,protocol_family_t protocol,const struct kev_msg * event)8515 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8516     const struct kev_msg *event)
8517 {
8518 #pragma unused(ifp, protocol, event)
8519 }
8520 
8521 static errno_t
ifproto_media_ioctl(struct ifnet * ifp,protocol_family_t protocol,unsigned long command,void * argument)8522 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8523     unsigned long command, void *argument)
8524 {
8525 #pragma unused(ifp, protocol, command, argument)
8526 	return ENXIO;
8527 }
8528 
8529 static errno_t
ifproto_media_resolve_multi(ifnet_t ifp,const struct sockaddr * proto_addr,struct sockaddr_dl * out_ll,size_t ll_len)8530 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8531     struct sockaddr_dl *out_ll, size_t ll_len)
8532 {
8533 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8534 	return ENXIO;
8535 }
8536 
8537 static errno_t
ifproto_media_send_arp(struct ifnet * ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8538 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8539     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8540     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8541 {
8542 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8543 	return ENXIO;
8544 }
8545 
8546 extern int if_next_index(void);
8547 extern int tcp_ecn_outbound;
8548 
8549 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8550 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8551 {
8552 	uint32_t sflags = 0;
8553 	int err;
8554 
8555 	if (if_flowadv) {
8556 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8557 	}
8558 
8559 	if (if_delaybased_queue) {
8560 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8561 	}
8562 
8563 	if (ifp->if_output_sched_model ==
8564 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8565 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8566 	}
8567 	/* Inherit drop limit from the default queue */
8568 	if (ifp->if_snd != ifcq) {
8569 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8570 	}
8571 	/* Initialize transmit queue(s) */
8572 	err = ifclassq_setup(ifcq, ifp, sflags);
8573 	if (err != 0) {
8574 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8575 		    "err=%d", __func__, ifp, err);
8576 		/* NOTREACHED */
8577 	}
8578 }
8579 
/*
 * Attach a previously allocated/initialized ifnet to the system.
 *
 * ll_addr, if non-NULL, supplies the interface's link-layer address;
 * its length must match ifp->if_addrlen (or if_addrlen must be 0, in
 * which case it is inherited from ll_addr->sdl_alen).
 *
 * Returns 0 on success; EINVAL on bad parameters; EEXIST if the ifnet
 * is already on the global list; ENODEV if the family module callbacks
 * (if_add_proto/if_del_proto) are missing; ENOBUFS if no if_index slot
 * or link address storage could be obtained.
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif  nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	/* set by dlil_create_input_thread(); identifies the thread variant */
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	/* Adopt or validate the link-layer address length */
	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* A recycled (DLIF_REUSE) ifnet keeps its multicast list */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	if (ifp->if_index == 0) {
		int idx = if_next_index();

		/*
		 * Since we exhausted the list of
		 * if_index's, try to find an empty slot
		 * in ifindex2ifnet.
		 */
		if (idx == -1 && if_index >= UINT16_MAX) {
			for (int i = 1; i < if_index; i++) {
				if (ifindex2ifnet[i] == NULL &&
				    ifnet_addrs[i - 1] == NULL) {
					idx = i;
					break;
				}
			}
		}
		if (idx == -1) {
			/* no slot available; undo and bail out */
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	ifa_addref(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* publish the ifnet: global list plus index lookup table */
	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		if (err == ENODEV) {
			/* ENODEV means no dedicated thread was created */
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* bump the start thread's scheduling precedence slightly */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* bump the poller thread's scheduling precedence slightly */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/*
	 * Count link-layer multicast memberships carried over from a
	 * previous incarnation of this (recycled) interface.
	 */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);
	VERIFY(ifp->if_delegated.ultra_constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that.  This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
9082 
9083 /*
9084  * Prepare the storage for the first/permanent link address, which must
9085  * must have the same lifetime as the ifnet itself.  Although the link
9086  * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9087  * its location in memory must never change as it may still be referred
9088  * to by some parts of the system afterwards (unfortunate implementation
9089  * artifacts inherited from BSD.)
9090  *
9091  * Caller must hold ifnet lock as writer.
9092  */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa = NULL;
	struct sockaddr_dl *addr_sdl, *mask_sdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Compute the sockaddr_dl sizes: the mask covers the header plus
	 * the interface name; the address additionally covers if_addrlen
	 * bytes of link-layer address, rounded up to a 32-bit multiple.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		struct dl_if_lladdr_xtra_space *__single dl_if_lladdr_ext;

		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			/* first use of extended storage; allocate permanently */
			dl_if_lladdr_ext = zalloc_permanent(
				sizeof(*dl_if_lladdr_ext), ZALIGN(struct ifaddr));

			ifa = &dl_if_lladdr_ext->ifa;
			ifa_lock_init(ifa);
			ifa_initref(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		} else {
			/* already using extended storage; recover container */
			dl_if_lladdr_ext = __unsafe_forge_single(
				struct dl_if_lladdr_xtra_space*, ifa);
			ifa = &dl_if_lladdr_ext->ifa;
		}

		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		bzero(dl_if_lladdr_ext->addr_sdl_bytes,
		    sizeof(dl_if_lladdr_ext->addr_sdl_bytes));
		bzero(dl_if_lladdr_ext->mask_sdl_bytes,
		    sizeof(dl_if_lladdr_ext->mask_sdl_bytes));
		addr_sdl = SDL(dl_if_lladdr_ext->addr_sdl_bytes);
		mask_sdl = SDL(dl_if_lladdr_ext->mask_sdl_bytes);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			ifa_initref(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		bzero(dl_if->dl_if_lladdr.addr_sdl_bytes,
		    sizeof(dl_if->dl_if_lladdr.addr_sdl_bytes));
		bzero(dl_if->dl_if_lladdr.mask_sdl_bytes,
		    sizeof(dl_if->dl_if_lladdr.mask_sdl_bytes));
		addr_sdl = SDL(dl_if->dl_if_lladdr.addr_sdl_bytes);
		mask_sdl = SDL(dl_if->dl_if_lladdr.mask_sdl_bytes);
	}

	/* remember the previous lladdr so its reference can be dropped */
	if (ifp->if_lladdr != ifa) {
		oifa = ifp->if_lladdr;
		ifp->if_lladdr = ifa;
	}

	/* populate the address sockaddr_dl: name, index, type, lladdr */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = SA(addr_sdl);
	addr_sdl->sdl_len = (u_char)socksize;
	addr_sdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, addr_sdl->sdl_data, min(namelen,
		    sizeof(addr_sdl->sdl_data)));
		addr_sdl->sdl_nlen = (u_char)namelen;
	} else {
		addr_sdl->sdl_nlen = 0;
	}
	addr_sdl->sdl_index = ifp->if_index;
	addr_sdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		addr_sdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(addr_sdl), addr_sdl->sdl_alen);
	} else {
		addr_sdl->sdl_alen = 0;
	}
	/* the netmask is all-ones over the name portion */
	ifa->ifa_netmask = SA(mask_sdl);
	mask_sdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		mask_sdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	if (oifa != NULL) {
		ifa_remref(oifa);
	}

	return ifa;
}
9214 
/*
 * Purge all network-layer addresses from the interface: IPv4 (when
 * INET is configured) followed by IPv6.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9223 
/*
 * First phase of interface detach: mark the ifnet as detaching, remove it
 * from the global lookup structures (ifnet_head, ifindex2ifnet[]) so it is
 * no longer visible, reset per-interface state, and hand the remainder of
 * the teardown to the detacher worker thread (ifnet_detach_final) to avoid
 * reentrancy.
 *
 * Returns 0 on success, EINVAL if ifp is NULL or was never attached, and
 * ENXIO if a detach is already in progress.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Stop IPv6 CGA address generation for this interface, if any */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	/* Lock order: ifnet_head lock before the per-ifnet lock */
	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	/* Atomically transition IFRF_ATTACHED -> IFRF_DETACHING */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	/*
	 * Increment the generation count on interface deletion
	 */
	ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9418 
/*
 * Append ifp to the global list of interfaces awaiting final detach and
 * wake the detacher worker thread.  Caller must hold the dlil_if lock
 * (asserted below).
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	/* detacher thread sleeps on &ifnet_delayed_run */
	wakeup((caddr_t)&ifnet_delayed_run);
}
9429 
/*
 * Remove and return the next interface awaiting final detach, or NULL if
 * the list is empty.  Caller must hold the dlil_if lock.  The detaching
 * link pointers are cleared so ifnet_detach_final() can VERIFY them.
 */
static struct ifnet *
ifnet_detaching_dequeue(void)
{
	struct ifnet *ifp;

	dlil_if_lock_assert();

	ifp = TAILQ_FIRST(&ifnet_detaching_head);
	/* count and list emptiness must agree */
	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
	if (ifp != NULL) {
		VERIFY(ifnet_detaching_cnt != 0);
		--ifnet_detaching_cnt;
		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
		ifp->if_detaching_link.tqe_next = NULL;
		ifp->if_detaching_link.tqe_prev = NULL;
	}
	return ifp;
}
9448 
/*
 * Continuation body of the detacher thread: drain the detaching list,
 * running ifnet_detach_final() on each interface (with the dlil_if lock
 * dropped around the potentially-blocking final detach), then re-arm the
 * wait and block on itself as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock: final detach can block/sleep */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* list drained; sleep until the next ifnet_detaching_enqueue() */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9491 
/*
 * Entry point of the detacher thread.  Arms the initial wait, marks the
 * thread embryonic (so the continuation can decrement the pending-thread
 * count exactly once), self-wakes to leave the embryonic state, and then
 * blocks into ifnet_detacher_thread_cont().  Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9508 
/*
 * Second (final) phase of interface detach, run from the detacher worker
 * thread.  Waits for all outstanding I/O references to drain, then tears
 * down every remaining attachment in order: BPF, Skywalk nexuses, send
 * queue, interface filters, addresses, protocols, the permanent link
 * address, the starter/poller/input threads, cached routes, and finally
 * clears IFRF_DETACHING, calls the driver's if_free callback, marks the
 * dlil_ifnet slot reusable, and drops the attach reference.
 *
 * The ordering of these steps is load-bearing; several later VERIFYs
 * depend on the earlier steps having completed.
 */
static void
ifnet_detach_final(struct ifnet *ifp)
{
	struct ifnet_filter *filter, *filter_next;
	struct dlil_ifnet *dlifp;
	struct ifnet_filter_head fhead;
	struct dlil_threading_info *inp;
	struct ifaddr *ifa;
	ifnet_detached_func if_free;
	int i;
	bool waited = false;

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

#if SKYWALK
	dlil_netif_detach_notify(ifp);
	/*
	 * Wait for the datapath to quiesce before tearing down
	 * netif/flowswitch nexuses.
	 */
	dlil_quiesce_and_detach_nexuses(ifp);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}

	/*
	 * Wait until the existing IO references get released
	 * before we proceed with ifnet_detach.  This is not a
	 * common case, so block without using a continuation.
	 */
	while (ifp->if_refio > 0) {
		waited = true;
		DLIL_PRINTF("%s: %s waiting for IO references to drain\n",
		    __func__, if_name(ifp));
		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
		    (PZERO - 1), "ifnet_ioref_wait", NULL);
	}
	if (waited) {
		DLIL_PRINTF("%s: %s IO references drained\n",
		    __func__, if_name(ifp));
	}
	VERIFY(ifp->if_datamov == 0);
	VERIFY(ifp->if_drainers == 0);
	VERIFY(ifp->if_suspend == 0);
	ifp->if_refflags &= ~IFRF_READY;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* Clear agent IDs */
	if (ifp->if_agentids != NULL) {
		kfree_data(ifp->if_agentids,
		    sizeof(uuid_t) * ifp->if_agentcount);
		ifp->if_agentids = NULL;
	}
	ifp->if_agentcount = 0;

#if SKYWALK
	VERIFY(LIST_EMPTY(&ifp->if_netns_tokens));
#endif /* SKYWALK */
	/* Drain and destroy send queue */
	ifclassq_teardown(ifp->if_snd);

	/* Detach interface filters */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	/* steal the whole filter list, then detach each entry unlocked */
	fhead = ifp->if_flt_head;
	TAILQ_INIT(&ifp->if_flt_head);

	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
		filter_next = TAILQ_NEXT(filter, filt_next);
		lck_mtx_unlock(&ifp->if_flt_lock);

		dlil_detach_filter_internal(filter, 1);
		lck_mtx_lock(&ifp->if_flt_lock);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Tell upper layers to drop their network addresses */
	if_purgeaddrs(ifp);

	ifnet_lock_exclusive(ifp);

	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));

	/* Unplumb all protocols */
	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
		struct if_proto *proto;

		/*
		 * proto_unplumb() needs the ifnet lock dropped; re-read the
		 * slot head each iteration since the list mutates under us.
		 */
		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		while (proto != NULL) {
			protocol_family_t family = proto->protocol_family;
			ifnet_lock_done(ifp);
			proto_unplumb(family, ifp);
			ifnet_lock_exclusive(ifp);
			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		}
		/* There should not be any protocols left */
		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
	}
	kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
	ifp->if_proto_hash = NULL;

	/* Detach (permanent) link address from if_addrhead */
	ifa = TAILQ_FIRST(&ifp->if_addrhead);
	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
	IFA_LOCK(ifa);
	if_detach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Remove (permanent) link address from ifnet_addrs[] */
	ifa_remref(ifa);
	ifnet_addrs[ifp->if_index - 1] = NULL;

	/* This interface should not be on {ifnet_head,detaching} */
	VERIFY(ifp->if_link.tqe_next == NULL);
	VERIFY(ifp->if_link.tqe_prev == NULL);
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);

	/* The slot should have been emptied */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* There should not be any addresses left */
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));

	/*
	 * Signal the starter thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_start_thread != THREAD_NULL) {
		lck_mtx_lock_spin(&ifp->if_start_lock);
		ifp->if_start_flags |= IFSF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* wait for starter thread to terminate */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_start_thread,
			    &ifp->if_start_lock, (PZERO - 1),
			    "ifnet_start_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_start_lock);
		if (dlil_verbose) {
			/*
			 * NOTE(review): missing trailing "\n" here, unlike the
			 * matching poller message below — confirm and fix.
			 */
			DLIL_PRINTF("%s: %s starter thread termination complete",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * Signal the poller thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_poll_thread != THREAD_NULL) {
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		/* wait for poller thread to terminate */
		lck_mtx_lock(&ifp->if_poll_lock);
		while (ifp->if_poll_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_poll_thread,
			    &ifp->if_poll_lock, (PZERO - 1),
			    "ifnet_poll_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_poll_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s poller thread termination complete\n",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * If thread affinity was set for the workloop thread, we will need
	 * to tear down the affinity and release the extra reference count
	 * taken at attach time.  Does not apply to lo0 or other interfaces
	 * without dedicated input threads.
	 */
	if ((inp = ifp->if_inp) != NULL) {
		VERIFY(inp != dlil_main_input_thread);

		if (inp->dlth_affinity) {
			struct thread *tp, *wtp, *ptp;

			lck_mtx_lock_spin(&inp->dlth_lock);
			wtp = inp->dlth_driver_thread;
			inp->dlth_driver_thread = THREAD_NULL;
			ptp = inp->dlth_poller_thread;
			inp->dlth_poller_thread = THREAD_NULL;
			ASSERT(inp->dlth_thread != THREAD_NULL);
			tp = inp->dlth_thread;    /* don't nullify now */
			inp->dlth_affinity_tag = 0;
			inp->dlth_affinity = FALSE;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Tear down poll thread affinity */
			if (ptp != NULL) {
				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
				VERIFY(ifp->if_xflags & IFXF_LEGACY);
				(void) dlil_affinity_set(ptp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(ptp);
			}

			/* Tear down workloop thread affinity */
			if (wtp != NULL) {
				(void) dlil_affinity_set(wtp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(wtp);
			}

			/* Tear down DLIL input thread affinity */
			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
			thread_deallocate(tp);
		}

		/* disassociate ifp DLIL input thread */
		ifp->if_inp = NULL;

		/* if the worker thread was created, tell it to terminate */
		if (inp->dlth_thread != THREAD_NULL) {
			lck_mtx_lock_spin(&inp->dlth_lock);
			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
				wakeup_one((caddr_t)&inp->dlth_flags);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			/* drop ifnet lock while sleeping on the input thread */
			ifnet_lock_done(ifp);

			/* wait for the input thread to terminate */
			lck_mtx_lock_spin(&inp->dlth_lock);
			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
			    == 0) {
				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			ifnet_lock_exclusive(ifp);
		}

		/* clean-up input thread state */
		dlil_clean_threading_info(inp);
		/* clean-up poll parameters */
		VERIFY(ifp->if_poll_thread == THREAD_NULL);
		dlil_reset_rxpoll_params(ifp);
	}

	/* The driver might unload, so point these to ourselves */
	if_free = ifp->if_free;
	ifp->if_output_dlil = ifp_if_output;
	ifp->if_output = ifp_if_output;
	ifp->if_pre_enqueue = ifp_if_output;
	ifp->if_start = ifp_if_start;
	ifp->if_output_ctl = ifp_if_ctl;
	ifp->if_input_dlil = ifp_if_input;
	ifp->if_input_poll = ifp_if_input_poll;
	ifp->if_input_ctl = ifp_if_ctl;
	ifp->if_ioctl = ifp_if_ioctl;
	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
	ifp->if_free = ifp_if_free;
	ifp->if_demux = ifp_if_demux;
	ifp->if_event = ifp_if_event;
	ifp->if_framer_legacy = ifp_if_framer;
	ifp->if_framer = ifp_if_framer_extended;
	ifp->if_add_proto = ifp_if_add_proto;
	ifp->if_del_proto = ifp_if_del_proto;
	ifp->if_check_multi = ifp_if_check_multi;

	/* wipe out interface description */
	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	ifp->if_desc.ifd_len = 0;
	VERIFY(ifp->if_desc.ifd_desc != NULL);
	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);

	/* there shouldn't be any delegation by now */
	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);
	VERIFY(ifp->if_delegated.ultra_constrained == 0);

	/* QoS marking get cleared */
	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);

#if SKYWALK
	/* the nexus destructor is responsible for clearing these */
	VERIFY(ifp->if_na_ops == NULL);
	VERIFY(ifp->if_na == NULL);
#endif /* SKYWALK */

	/* promiscuous/allmulti counts need to start at zero again */
	ifp->if_pcount = 0;
	ifp->if_amcount = 0;
	ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);

	ifnet_lock_done(ifp);

#if PF
	/*
	 * Detach this interface from packet filter, if enabled.
	 */
	pf_ifnet_hook(ifp, 0);
#endif /* PF */

	/* Filter list should be empty */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Last chance to drain send queue */
	if_qflush_snd(ifp, 0);

	/* Last chance to cleanup any cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	VERIFY(!ifp->if_fwd_cacheok);
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Ignore any pending data threshold as the interface is anyways gone */
	ifp->if_data_threshold = 0;

	VERIFY(ifp->if_dt_tcall != NULL);
	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));

	ifnet_llreach_ifdetach(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);

	/*
	 * Finally, mark this ifnet as detached.
	 */
	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached\n", if_name(ifp));
	}
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	ifp->if_refflags &= ~IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);
	/* notify the driver; it may free its private state */
	if (if_free != NULL) {
		if_free(ifp);
	}

	ifclassq_release(&ifp->if_snd);

	/* we're fully detached, clear the "in use" bit */
	dlifp = (struct dlil_ifnet *)ifp;
	lck_mtx_lock(&dlifp->dl_if_lock);
	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
	dlifp->dl_if_flags &= ~DLIF_INUSE;
	lck_mtx_unlock(&dlifp->dl_if_lock);

	/* Release reference held during ifnet attach */
	ifnet_release(ifp);
}
9904 
/*
 * Detached-interface output stub: silently frees the packet chain and
 * reports success.  Installed by ifnet_detach_final() so a late caller
 * into an unloading driver cannot fault.
 */
errno_t
ifp_if_output(struct ifnet *ifp, struct mbuf *m)
{
#pragma unused(ifp)
	m_freem_list(m);
	return 0;
}
9912 
/*
 * Detached-interface start stub: just purge whatever is queued on the
 * interface; there is no driver left to transmit it.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9918 
/*
 * Detached-interface input stub: frees the inbound packet chain and
 * returns ENXIO (device not configured).
 */
static errno_t
ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
#pragma unused(ifp, m_tail, s, poll, tp)
	m_freem_list(m_head);
	return ENXIO;
}
9928 
9929 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9930 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9931     struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9932 {
9933 #pragma unused(ifp, flags, max_cnt)
9934 	if (m_head != NULL) {
9935 		*m_head = NULL;
9936 	}
9937 	if (m_tail != NULL) {
9938 		*m_tail = NULL;
9939 	}
9940 	if (cnt != NULL) {
9941 		*cnt = 0;
9942 	}
9943 	if (len != NULL) {
9944 		*len = 0;
9945 	}
9946 }
9947 
/*
 * Detached-interface control stub: rejects every output/input control
 * command with EOPNOTSUPP.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
9954 
/*
 * Detached-interface demux stub: frees the packet and returns EJUSTRETURN
 * so the caller does not touch the (already freed) mbuf.
 */
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	m_freem(m);
	return EJUSTRETURN;
}
9962 
/*
 * Detached-interface stub: protocol attach is no longer possible; always
 * fails with EINVAL.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
9970 
/*
 * Detached-interface stub: protocol detach is no longer possible; always
 * fails with EINVAL.
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
9977 
/*
 * Detached-interface stub: multicast membership checks are unsupported;
 * always returns EOPNOTSUPP.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
9984 
/*
 * Detached-interface legacy framer stub.  The legacy framer signature
 * differs by platform (embedded targets carry the pre/post header-length
 * out-params; macOS does not), so the prototype is selected with the
 * preprocessor.  Both variants simply forward to the extended framer
 * stub, which frees the packet.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10003 
10004 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10005 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10006     const struct sockaddr *sa, const char *ll, const char *t,
10007     u_int32_t *pre, u_int32_t *post)
10008 {
10009 #pragma unused(ifp, sa, ll, t)
10010 	m_freem(*m);
10011 	*m = NULL;
10012 
10013 	if (pre != NULL) {
10014 		*pre = 0;
10015 	}
10016 	if (post != NULL) {
10017 		*post = 0;
10018 	}
10019 
10020 	return EJUSTRETURN;
10021 }
10022 
/*
 * Detached-interface ioctl stub: rejects every request with EOPNOTSUPP.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
10029 
/*
 * Detached-interface BPF tap stub: accepts the request but does nothing
 * (there is no driver to forward tapped packets from).
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
10037 
/*
 * Detached-interface free stub: intentionally a no-op; the driver's real
 * if_free was already invoked (once) from ifnet_detach_final().
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10043 
/*
 * Detached-interface event stub: kernel events delivered after detach are
 * intentionally dropped.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10049 
/*
 * Acquire a dlil_ifnet for a new (or re-attached) interface of the given
 * family.  The whole dlil_ifnet_head list is scanned first: an in-use
 * interface with the same extended name or the same unique id yields
 * EBUSY; an idle entry with a matching unique id is recycled (DLIF_REUSE).
 * Otherwise a fresh, 64-bit-aligned dlil_ifnet is carved out of dlif_zone,
 * its locks and storage are initialized, and it is appended to the list.
 *
 * On success *ifp points at the acquired ifnet (with a dlil reference
 * held) and 0 is returned; otherwise EBUSY or ENOMEM.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in the in-use state.
	 * To make sure of that, the list has to be traversed completely.
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		/* re-check under the entry lock: it may have been claimed */
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		/* keep a private copy of the caller's unique id */
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* name/xname point into storage embedded in the dlil_ifnet */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	LIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;
	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10228 
/*
 * Common teardown when an ifnet is returned to the DLIL interface cache.
 * Drops the allocation accounting counters, frees the broadcast address
 * storage, and re-points if_name/if_xname at the storage embedded in the
 * enclosing dlil_ifnet so they remain valid after detach.  When
 * clear_in_use is true, DLIF_INUSE is also cleared so the dlil_ifnet can
 * be recycled by a future interface allocation.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	/* balance the increments performed when the ifnet was allocated */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	kfree_data_counted_by(ifp->if_broadcast.ptr, ifp->if_broadcast.length);
	lck_mtx_lock(&dlifp->dl_if_lock);
	/* keep the name but make if_name point back at embedded storage */
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10255 
/*
 * Release an ifnet back to the DLIL cache without clearing DLIF_INUSE;
 * the dlil_ifnet remains reserved for reuse by the same interface.
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10261 
/* Acquire the global lock protecting the dlil_ifnet_head list. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10267 
/* Release the global lock protecting the dlil_ifnet_head list. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10273 
/* Assert that the current thread owns the dlil_ifnet_head list lock. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10279 
/*
 * Detach all protocol attachments from an interface that still require
 * an explicit unplumb (PF_INET and PF_INET6).
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10295 
/*
 * Copy the interface's cached IPv4 source route into *dst; the route
 * reference travels with the copy (see route_copyout).
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	/* take as spin lock, then upgrade to a full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10306 
/*
 * Store *src as the interface's cached IPv4 source route (consuming the
 * caller's route reference), unless forwarding-cache use is currently
 * disallowed for this interface, in which case the route is released.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	/* take as spin lock, then upgrade to a full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10320 
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the interface's
 * cached IPv6 source route into *dst along with its reference.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	/* take as spin lock, then upgrade to a full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10332 
/*
 * IPv6 counterpart of ifp_src_route_copyin(): install *src as the
 * cached IPv6 source route (consuming the caller's reference), or
 * release it if the forwarding cache is disabled for this interface.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	/* take as spin lock, then upgrade to a full mutex for the copy */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10347 
/*
 * Look up (and cache) the scoped route to src_ip via ifp.  Reuses the
 * interface's cached IPv4 source route when it is still usable and
 * matches the destination; otherwise performs a fresh scoped lookup and
 * stores the result back into the cache.  Returns a referenced rtentry
 * (caller must release it) or NULL if no route was found.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	dst = SIN(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		/* (re)initialize the destination sockaddr if not yet AF_INET */
		if (dst->sin_family != AF_INET) {
			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		/* ROUTE_RELEASE above must have left no route behind */
		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped(SA(dst),
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10382 
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): look up (and cache)
 * the scoped route to *src_ip6 via ifp.  Returns a referenced rtentry
 * (caller must release it) or NULL if no route was found.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		/* (re)initialize the destination sockaddr if not yet AF_INET6 */
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/*
		 * NOTE(review): the IPv4 variant uses VERIFY(ro_rt == NULL)
		 * and looks up unconditionally; here the lookup is guarded
		 * instead.  After ROUTE_RELEASE the route should be NULL, so
		 * the two appear equivalent — confirm before unifying.
		 */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				SA(&src_rt.ro_dst), 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10419 
/*
 * Update the interface's link quality metric (LQM) state and post a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED event when it changes.
 *
 * Locking contract: 'locked' indicates whether the caller already holds
 * the ifnet lock exclusively.  The lock is always dropped before the
 * kernel event is posted, and reacquired before returning only when the
 * caller held it on entry.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* quality is at/below abort threshold: have TCP abort flows soon */
		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10483 
/*
 * Update the interface's RRC (radio resource control) state and post a
 * KEV_DL_RRC_STATE_CHANGED event when it changes.
 *
 * Must be called with the ifnet lock held exclusively; the lock is
 * dropped while the kernel event is posted and reacquired before
 * returning (see the unconditional ifnet_lock_done/ifnet_lock_exclusive
 * pair below).
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10513 
/*
 * Apply a caller-supplied interface state update (LQM, RRC state and/or
 * interface availability, as selected by valid_bitmask).
 *
 * Returns ENOTSUP when RRC state is set on a non-cellular interface,
 * EINVAL for out-of-range LQM or RRC values, 0 on success.  When the
 * update marks the interface available, TCP connections on it are told
 * to send probes immediately instead of waiting for their timers.
 */
errno_t
if_state_update(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	u_short if_index_available = 0;

	ifnet_lock_exclusive(ifp);

	/* RRC state only makes sense for cellular interfaces */
	if ((ifp->if_type != IFT_CELLULAR) &&
	    (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		ifnet_lock_done(ifp);
		return ENOTSUP;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}

	/*
	 * The helpers below are called with the ifnet lock held (locked=1
	 * for if_lqm_update); both drop and reacquire it internally while
	 * posting kernel events.
	 */
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		ifp->if_interface_state.interface_availability =
		    if_interface_state->interface_availability;

		if (ifp->if_interface_state.interface_availability ==
		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
			    __func__, if_name(ifp), ifp->if_index);
			if_index_available = ifp->if_index;
		} else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
			    __func__, if_name(ifp), ifp->if_index);
		}
	}
	ifnet_lock_done(ifp);

	/*
	 * Check if the TCP connections going on this interface should be
	 * forced to send probe packets instead of waiting for TCP timers
	 * to fire. This is done on an explicit notification such as
	 * SIOCSIFINTERFACESTATE which marks the interface as available.
	 */
	if (if_index_available > 0) {
		tcp_interface_send_probe(if_index_available);
	}

	return 0;
}
10584 
/*
 * Snapshot the interface's current state (RRC, LQM, availability) into
 * the caller's structure; only fields marked valid on the interface are
 * copied, and valid_bitmask in the output reflects which ones were.
 */
void
if_get_state(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	ifnet_lock_shared(ifp);

	if_interface_state->valid_bitmask = 0;

	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_RRC_STATE_VALID;
		if_interface_state->rrc_state =
		    ifp->if_interface_state.rrc_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
		if_interface_state->lqm_state =
		    ifp->if_interface_state.lqm_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		if_interface_state->interface_availability =
		    ifp->if_interface_state.interface_availability;
	}

	ifnet_lock_done(ifp);
}
10617 
10618 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10619 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10620 {
10621 	if (conn_probe > 1) {
10622 		return EINVAL;
10623 	}
10624 	if (conn_probe == 0) {
10625 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10626 	} else {
10627 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10628 	}
10629 
10630 #if NECP
10631 	necp_update_all_clients();
10632 #endif /* NECP */
10633 
10634 	tcp_probe_connectivity(ifp, conn_probe);
10635 	return 0;
10636 }
10637 
10638 /* for uuid.c */
/* for uuid.c */
/*
 * Scan the interface list for the best Ethernet candidate for UUID
 * generation.  Returns the ifindex of en0 if present, else 0; in the
 * latter case *ret_other_index is set to the lowest-unit "en" interface
 * or, failing that, any IFT_ETHER interface (0 if none).
 *
 * Caller must hold the ifnet head lock (the list is walked unlocked
 * here apart from per-ifnet shared locks).
 */
static int
get_ether_index(int * ret_other_index)
{
	struct ifnet *ifp;
	int en0_index = 0;
	int other_en_index = 0;
	int any_ether_index = 0;
	short best_unit = 0;

	*ret_other_index = 0;
	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
		/*
		 * find en0, or if not en0, the lowest unit en*, and if not
		 * that, any ethernet
		 */
		ifnet_lock_shared(ifp);
		if (strcmp(ifp->if_name, "en") == 0) {
			if (ifp->if_unit == 0) {
				/* found en0, we're done */
				en0_index = ifp->if_index;
				ifnet_lock_done(ifp);
				break;
			}
			if (other_en_index == 0 || ifp->if_unit < best_unit) {
				other_en_index = ifp->if_index;
				best_unit = ifp->if_unit;
			}
		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
			any_ether_index = ifp->if_index;
		}
		ifnet_lock_done(ifp);
	}
	if (en0_index == 0) {
		if (other_en_index != 0) {
			*ret_other_index = other_en_index;
		} else if (any_ether_index != 0) {
			*ret_other_index = any_ether_index;
		}
	}
	return en0_index;
}
10680 
/*
 * Fill *node with a 6-byte Ethernet address suitable as the node field
 * of a UUID.  Prefers en0 (the index is cached across calls and
 * re-validated against ifindex2ifnet), falling back to another "en"
 * interface or any Ethernet.  The permanent (factory) MAC is used when
 * known, since it never changes.  Returns 0 on success, -1 if no
 * Ethernet interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* refresh the cached en0 index if unset or no longer valid */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10722 
10723 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10724 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10725     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10726 {
10727 	struct kev_dl_node_presence kev;
10728 	struct sockaddr_dl *sdl;
10729 	struct sockaddr_in6 *sin6;
10730 	int ret = 0;
10731 
10732 	VERIFY(ifp);
10733 	VERIFY(sa);
10734 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10735 
10736 	bzero(&kev, sizeof(kev));
10737 	sin6 = &kev.sin6_node_address;
10738 	sdl = &kev.sdl_node_address;
10739 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10740 	kev.rssi = rssi;
10741 	kev.link_quality_metric = lqm;
10742 	kev.node_proximity_metric = npm;
10743 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10744 
10745 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10746 	if (ret == 0 || ret == EEXIST) {
10747 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10748 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10749 		if (err != 0) {
10750 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10751 			    "error %d\n", __func__, err);
10752 		}
10753 	}
10754 
10755 	if (ret == EEXIST) {
10756 		ret = 0;
10757 	}
10758 	return ret;
10759 }
10760 
/*
 * Report that a peer node has disappeared from an interface.  Accepts
 * either an AF_INET6 sockaddr (link-layer address recovered from the
 * neighbor cache) or an AF_LINK sockaddr (IPv6 address derived from it).
 * Posts KEV_DL_NODE_ABSENCE only when the neighbor-cache removal
 * succeeded.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* stamp the event with this interface's type and index */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
10801 
/*
 * Variant of dlil_node_present() that takes the IPv6 address and the
 * link-layer address as two separate sockaddrs instead of decomposing a
 * single one.  EEXIST from the neighbor cache is treated as success.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	/* copy the link-layer address, stamped with our type and index */
	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	/* a node that was already present is not an error to the caller */
	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
10845 
10846 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)10847 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
10848     kauth_cred_t *credp)
10849 {
10850 	const u_int8_t *bytes;
10851 	size_t size;
10852 
10853 	bytes = CONST_LLADDR(sdl);
10854 	size = sdl->sdl_alen;
10855 
10856 #if CONFIG_MACF
10857 	if (dlil_lladdr_ckreq) {
10858 		switch (sdl->sdl_type) {
10859 		case IFT_ETHER:
10860 		case IFT_IEEE1394:
10861 			break;
10862 		default:
10863 			credp = NULL;
10864 			break;
10865 		}
10866 		;
10867 
10868 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
10869 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
10870 				[0] = 2
10871 			};
10872 
10873 			bytes = unspec;
10874 		}
10875 	}
10876 #else
10877 #pragma unused(credp)
10878 #endif
10879 
10880 	if (sizep != NULL) {
10881 		*sizep = size;
10882 	}
10883 	return bytes;
10884 }
10885 
/*
 * Post a KEV_DL_ISSUES kernel event reporting a module-identified issue
 * on the interface.  'modid' identifies the reporting module (required,
 * DLIL_MODIDLEN bytes); 'info' is optional module-specific detail
 * (DLIL_MODARGLEN bytes).  The event is stamped with the current time.
 */
void
dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
    u_int8_t info[DLIL_MODARGLEN])
{
	struct kev_dl_issues kev;
	struct timeval tv;

	VERIFY(ifp != NULL);
	VERIFY(modid != NULL);
	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);

	bzero(&kev, sizeof(kev));

	microtime(&tv);
	kev.timestamp = tv.tv_sec;
	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
	if (info != NULL) {
		bcopy(info, &kev.info, DLIL_MODARGLEN);
	}

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
	    &kev.link_data, sizeof(kev), FALSE);
}
10910 
/*
 * Handle SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC ioctls: set or get
 * the interface's opportunistic throttling level.  Setting requires
 * superuser.  On success (either direction) the current count of
 * opportunistic TCP/UDP connections on the interface is returned in
 * ifo_inuse.  EALREADY from ifnet_set_throttle is mapped to success.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* the level was already in effect; not an error to the caller */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
10969 
/*
 * Query the interface's current output throttling level into *level.
 * Returns ENXIO unless the interface uses the driver start callback
 * model (IFEF_TXSTART); defaults to IFNET_THROTTLE_OFF when the send
 * classq is not enabled.
 */
int
ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	*level = IFNET_THROTTLE_OFF;

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* Throttling works only for IFCQ, not ALTQ instances */
	if (IFCQ_IS_ENABLED(ifq)) {
		/* { set = 0 } => query the current level */
		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
		*level = req.level;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
10995 
/*
 * Set the interface's output throttling level (IFNET_THROTTLE_OFF or
 * IFNET_THROTTLE_OPPORTUNISTIC).  Returns ENXIO unless the interface
 * uses the driver start callback model, EINVAL for unknown levels.
 * When throttling is turned off, the transmit path is kicked so queued
 * packets start flowing again.
 */
int
ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;

	switch (level) {
	case IFNET_THROTTLE_OFF:
	case IFNET_THROTTLE_OPPORTUNISTIC:
		break;
	default:
		return EINVAL;
	}

	IFCQ_LOCK(ifq);
	if (IFCQ_IS_ENABLED(ifq)) {
		/* { set = 1 } => install the new level */
		cqrq_throttle_t req = { 1, level };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
	}
	IFCQ_UNLOCK(ifq);

	if (err == 0) {
		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
		    level);
#if NECP
		necp_update_all_clients();
#endif /* NECP */
		if (level == IFNET_THROTTLE_OFF) {
			ifnet_start(ifp);
		}
	}

	return err;
}
11037 
/*
 * Handle SIOCSIFLOG / SIOCGIFLOG ioctls: set or get the interface's
 * logging parameters (level, facility flags, category, subcategory).
 * Setting requires the PRIV_NET_INTERFACE_CONTROL privilege and
 * validates the level range and that at least one known facility flag
 * is present.
 */
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* at least one recognized facility bit must be set */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11085 
/*
 * Apply new logging parameters to the interface.  The level applies to
 * all facilities; facility bits accumulate into if_log.flags.  Requests
 * targeting non-DLIL facilities are forwarded to the driver via its
 * if_output_ctl callback when one is registered; otherwise those bits
 * are silently ignored.  Setting level to IFNET_LOG_DEFAULT clears all
 * facility flags.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* DLIL's own facility is handled here, not by the driver */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* IFNET_LOG_DEFAULT resets all facility flags */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=0x%x "
		    "arg=0x%x, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags, flags,
		    category, subcategory);
	}

	return err;
}
11143 
11144 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11145 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11146     int32_t *category, int32_t *subcategory)
11147 {
11148 	if (level != NULL) {
11149 		*level = ifp->if_log.level;
11150 	}
11151 	if (flags != NULL) {
11152 		*flags = ifp->if_log.flags;
11153 	}
11154 	if (category != NULL) {
11155 		*category = ifp->if_log.category;
11156 	}
11157 	if (subcategory != NULL) {
11158 		*subcategory = ifp->if_log.subcategory;
11159 	}
11160 
11161 	return 0;
11162 }
11163 
/*
 * Notify lower layers (and PF) that an address of the given family has
 * changed on the interface.  Returns EOPNOTSUPP when the driver has no
 * output control callback to deliver the notification to.
 */
int
ifnet_notify_address(struct ifnet *ifp, int af)
{
	struct ifnet_notify_address_params na;

#if PF
	(void) pf_ifaddr_hook(ifp);
#endif /* PF */

	if (ifp->if_output_ctl == NULL) {
		return EOPNOTSUPP;
	}

	bzero(&na, sizeof(na));
	na.address_family = (sa_family_t)af;

	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
	           sizeof(na), &na);
}
11183 
11184 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11185 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11186 {
11187 	if (ifp == NULL || flowid == NULL) {
11188 		return EINVAL;
11189 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11190 	    !IF_FULLY_ATTACHED(ifp)) {
11191 		return ENXIO;
11192 	}
11193 
11194 	*flowid = ifp->if_flowhash;
11195 
11196 	return 0;
11197 }
11198 
11199 errno_t
ifnet_disable_output(struct ifnet * ifp)11200 ifnet_disable_output(struct ifnet *ifp)
11201 {
11202 	int err = 0;
11203 
11204 	if (ifp == NULL) {
11205 		return EINVAL;
11206 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11207 	    !IF_FULLY_ATTACHED(ifp)) {
11208 		return ENXIO;
11209 	}
11210 
11211 	lck_mtx_lock(&ifp->if_start_lock);
11212 	if (ifp->if_start_flags & IFSF_FLOW_RESUME_PENDING) {
11213 		ifp->if_start_flags &= ~(IFSF_FLOW_RESUME_PENDING | IFSF_FLOW_CONTROLLED);
11214 	} else if ((err = ifnet_fc_add(ifp)) == 0) {
11215 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11216 	}
11217 	lck_mtx_unlock(&ifp->if_start_lock);
11218 
11219 	return err;
11220 }
11221 
11222 errno_t
ifnet_enable_output(struct ifnet * ifp)11223 ifnet_enable_output(struct ifnet *ifp)
11224 {
11225 	if (ifp == NULL) {
11226 		return EINVAL;
11227 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11228 	    !IF_FULLY_ATTACHED(ifp)) {
11229 		return ENXIO;
11230 	}
11231 
11232 	ifnet_start_common(ifp, TRUE, FALSE);
11233 	return 0;
11234 }
11235 
11236 void
ifnet_flowadv(uint32_t flowhash)11237 ifnet_flowadv(uint32_t flowhash)
11238 {
11239 	struct ifnet_fc_entry *ifce;
11240 	struct ifnet *ifp;
11241 
11242 	ifce = ifnet_fc_get(flowhash);
11243 	if (ifce == NULL) {
11244 		return;
11245 	}
11246 
11247 	VERIFY(ifce->ifce_ifp != NULL);
11248 	ifp = ifce->ifce_ifp;
11249 
11250 	/* flow hash gets recalculated per attach, so check */
11251 	if (ifnet_is_attached(ifp, 1)) {
11252 		if (ifp->if_flowhash == flowhash) {
11253 			lck_mtx_lock_spin(&ifp->if_start_lock);
11254 			if ((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) == 0) {
11255 				ifp->if_start_flags |= IFSF_FLOW_RESUME_PENDING;
11256 			}
11257 			lck_mtx_unlock(&ifp->if_start_lock);
11258 			(void) ifnet_enable_output(ifp);
11259 		}
11260 		ifnet_decr_iorefcnt(ifp);
11261 	}
11262 	ifnet_fc_entry_free(ifce);
11263 }
11264 
11265 /*
11266  * Function to compare ifnet_fc_entries in ifnet flow control tree
11267  */
11268 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11269 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11270 {
11271 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11272 }
11273 
/*
 * Register the interface in the global flow control tree, keyed by its
 * flow hash, so that a later flow advisory (ifnet_flowadv) can find it
 * and resume output.  Returns 0 when the entry was inserted or already
 * present for this ifp; EAGAIN on a flow-hash collision with a
 * different interface.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* Build a stack-local lookup key carrying only the flow hash */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex; the zone allocation below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11317 
/*
 * Look up and REMOVE the flow control entry matching `flowhash' from
 * the global tree.  Returns the detached entry (caller must free it
 * via ifnet_fc_entry_free), or NULL when there is no match or the
 * owning interface is detaching.
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* Stack-local lookup key carrying only the flow hash */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11355 
/*
 * Return a flow control entry to its zone allocator.
 */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11361 
11362 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11363 ifnet_calc_flowhash(struct ifnet *ifp)
11364 {
11365 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11366 	uint32_t flowhash = 0;
11367 
11368 	if (ifnet_flowhash_seed == 0) {
11369 		ifnet_flowhash_seed = RandomULong();
11370 	}
11371 
11372 	bzero(&fh, sizeof(fh));
11373 
11374 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11375 	fh.ifk_unit = ifp->if_unit;
11376 	fh.ifk_flags = ifp->if_flags;
11377 	fh.ifk_eflags = ifp->if_eflags;
11378 	fh.ifk_capabilities = ifp->if_capabilities;
11379 	fh.ifk_capenable = ifp->if_capenable;
11380 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11381 	fh.ifk_rand1 = RandomULong();
11382 	fh.ifk_rand2 = RandomULong();
11383 
11384 try_again:
11385 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11386 	if (flowhash == 0) {
11387 		/* try to get a non-zero flowhash */
11388 		ifnet_flowhash_seed = RandomULong();
11389 		goto try_again;
11390 	}
11391 
11392 	return flowhash;
11393 }
11394 
/*
 * Install (or clear, when len == 0) the opaque network signature for
 * the given address family on the interface.  Returns EINVAL for an
 * unsupported family or an oversized signature, ENOMEM when the
 * per-family extension area has not been allocated.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* note: break leaves switch; drop lock first */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				/* note: break leaves switch; drop lock first */
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11456 
/*
 * Copy out the interface's network signature for the given address
 * family.  On input *len is the caller's buffer size; on success it is
 * updated to the actual signature length.  Returns EINVAL on bad
 * arguments or a too-small buffer, ENOENT when no signature is set,
 * ENOMEM when the per-family extension area is missing.  *flags is
 * currently always reported as 0.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11517 
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on the interface
 * (a slot with prefix_len == 0 is cleared).  Prefix lengths must be one
 * of the well-known NAT64 lengths (32/40/48/56/64/96 bits, per RFC
 * 6052) and scoped (interface/link-local) prefixes are rejected.  When
 * at least one prefix was set, NECP clients are told to re-evaluate.
 * Returns EINVAL on a bad prefix, ENOMEM when the inet6 extension area
 * is missing.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* tell NECP clients outside the lock, and only on success */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11583 
11584 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)11585 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11586 {
11587 	int i, found_one = 0, error = 0;
11588 
11589 	if (ifp == NULL) {
11590 		return EINVAL;
11591 	}
11592 
11593 	if_inet6data_lock_shared(ifp);
11594 
11595 	if (IN6_IFEXTRA(ifp) == NULL) {
11596 		error = ENOMEM;
11597 		goto out;
11598 	}
11599 
11600 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11601 		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
11602 			found_one = 1;
11603 		}
11604 	}
11605 
11606 	if (found_one == 0) {
11607 		error = ENOENT;
11608 		goto out;
11609 	}
11610 
11611 	if (prefixes) {
11612 		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
11613 		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
11614 	}
11615 
11616 out:
11617 	if_inet6data_lock_done(ifp);
11618 
11619 	return error;
11620 }
11621 
/*
 * Transmit-path checksum debugging: when HWCKSUM_DBG_FINALIZE_FORCED
 * is set, force software finalization of delayed checksums before the
 * packet reaches the driver (TSO packets are exempt), and count how
 * many headers/payloads were finalized this way.  `hoff' is the offset
 * of the IP header within the mbuf.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		/* only IPv4/IPv6 are subject to checksum debugging */
		return;
	}
}
11663 
/*
 * Receive-path checksum debugging.  Depending on hwcksum_dbg_mode this
 * can (a) force partial checksum offload in software to emulate
 * hardware that lacks it, (b) verify a driver-supplied partial checksum
 * against a software recomputation, and (c) re-anchor the partial sum
 * at a different start offset to emulate various hardware behaviors.
 * `frame_header' points at the link-layer header preceding m_data.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity-check that the frame header lies inside the mbuf buffer */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* link-layer (frame) header length */
	hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		/* only IPv4/IPv6 are subject to checksum debugging */
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* discard whatever RX checksum state the driver set */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is relative to the frame header */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* make rxoff relative to m_data for m_sum16 below */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* re-anchor the sum at the adjusted offset */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11788 
#if DEBUG || DEVELOPMENT
/* Blob for sum16 verification (arbitrary gzip-like data; content is opaque) */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};

/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t       init;   /* sumr filled in at run time by the self-test */
	uint16_t        len;    /* span length in bytes, from start of sumdata */
	uint16_t        sumr;   /* reference */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
11851 
/*
 * Boot-time self-test for the various 16-bit 1's complement checksum
 * routines (m_sum16, b_sum16) against a slow reference implementation
 * (in_cksum_mbuf_ref) over sumdata, at every span length in sumtbl and
 * every byte alignment within an 8-byte window.  Panics on mismatch.
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *);               /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (uintptr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			/* compute the reference sum once per length, lazily */
			if (!sumtbl[n].init) {
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (uintptr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by aligment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
11940 #endif /* DEBUG || DEVELOPMENT */
11941 
/* Expand to a switch case returning the stringified constant name */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* kernel event code to its symbolic name for logging.
 * Returns the empty string for codes not listed here.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
		CASE_STRINGIFY(KEV_DL_SIFMTU);
		CASE_STRINGIFY(KEV_DL_SIFPHYS);
		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
		CASE_STRINGIFY(KEV_DL_ADDMULTI);
		CASE_STRINGIFY(KEV_DL_DELMULTI);
		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_OFF);
		CASE_STRINGIFY(KEV_DL_LINK_ON);
		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
		CASE_STRINGIFY(KEV_DL_ISSUES);
		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
11978 
11979 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)11980 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
11981 {
11982 #pragma unused(arg1)
11983 	struct ifnet *ifp = arg0;
11984 
11985 	if (ifnet_is_attached(ifp, 1)) {
11986 		nstat_ifnet_threshold_reached(ifp->if_index);
11987 		ifnet_decr_iorefcnt(ifp);
11988 	}
11989 }
11990 
/*
 * Datapath hook: when the interface's combined in+out byte count has
 * advanced past if_data_threshold since the last notification, schedule
 * the data-threshold thread call (dlil_dt_tcall_fn).  The CAS on
 * if_dt_bytes ensures only one racing caller wins, and the thread call
 * plus threshold_interval provide rate limiting.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* align the deadline to the periodic interval */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* no interval configured; notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12020 
12021 
/*
 * Forward per-flow interface statistics to the TCP layer for
 * aggregation.  Thin wrapper kept for KPI symmetry.
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12028 
/*
 * Atomically OR set_flags into *flags_p; returns the previous value.
 */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12034 
/*
 * Atomically clear clear_flags in *flags_p (AND with the complement).
 */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12040 
/*
 * Atomically set extended interface flags; returns the previous
 * if_eflags value.
 */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12046 
/*
 * Atomically clear extended interface flags.
 */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12052 
/*
 * Atomically set experimental/extra interface flags; returns the
 * previous if_xflags value.
 */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12058 
/*
 * Atomically clear experimental/extra interface flags.
 */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12064 
/*
 * Bump the interface's traffic rule generation id so that observers
 * (see ifnet_sync_traffic_rule_genid) know to refresh their state.
 */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}
12070 
/*
 * Compare the caller's cached generation id against the interface's
 * current one; on mismatch, refresh the caller's copy and return TRUE
 * so it knows to re-read the traffic rule state.
 */
__private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
{
	if (*genid != ifp->if_traffic_rule_genid) {
		*genid = ifp->if_traffic_rule_genid;
		return TRUE;
	}
	return FALSE;
}
/*
 * Publish a new traffic rule count (release ordering) and bump the
 * generation id so observers re-sync.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
12086 
12087 static void
log_hexdump(void * data,size_t len)12088 log_hexdump(void *data, size_t len)
12089 {
12090 	size_t i, j, k;
12091 	unsigned char *ptr = (unsigned char *)data;
12092 #define MAX_DUMP_BUF 32
12093 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12094 
12095 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12096 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12097 			unsigned char msnbl = ptr[j] >> 4;
12098 			unsigned char lsnbl = ptr[j] & 0x0f;
12099 
12100 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12101 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12102 
12103 			if ((j % 2) == 1) {
12104 				buf[k++] = ' ';
12105 			}
12106 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12107 				buf[k++] = ' ';
12108 			}
12109 		}
12110 		buf[k] = 0;
12111 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12112 	}
12113 }
12114 
12115 #if SKYWALK
12116 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12117 net_check_compatible_if_filter(struct ifnet *ifp)
12118 {
12119 	if (ifp == NULL) {
12120 		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12121 			return false;
12122 		}
12123 	} else {
12124 		if (ifp->if_flt_non_os_count > 0) {
12125 			return false;
12126 		}
12127 	}
12128 	return true;
12129 }
12130 #endif /* SKYWALK */
12131 
/*
 * Advance the dump cursor: subtract the bytes just written (k) from
 * the remaining capacity (clen), bail to the caller's `done' label when
 * the buffer is exhausted, otherwise move the write pointer (c)
 * forward.  Relies on c/k/clen locals and a done: label in the caller.
 */
#define DUMP_BUF_CHK() {        \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
}
12138 
int dlil_dump_top_if_qlen(char *, int);
/*
 * Debug helper: scan all attached interfaces and report, into `str',
 * the interface with the deepest send queue (ifcq) and the one with
 * the deepest DLIL input queue.  Returns the number of bytes written
 * (bounded by str_len).
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;
	struct ifnet *top_ifcq_ifp = NULL;
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;
	uint32_t top_inq_len = 0;

	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		/* dlil_ifnet embeds the ifnet; safe to down-cast */
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12180