xref: /xnu-8796.121.2/bsd/net/dlil.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 1999-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30  * support for mandatory and extensible security protections.  This notice
31  * is included in support of clause 2.2 (b) of the Apple Public License,
32  * Version 2.0.
33  */
34 #include <stddef.h>
35 #include <ptrauth.h>
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/domain.h>
44 #include <sys/user.h>
45 #include <sys/random.h>
46 #include <sys/socketvar.h>
47 #include <net/if_dl.h>
48 #include <net/if.h>
49 #include <net/route.h>
50 #include <net/if_var.h>
51 #include <net/dlil.h>
52 #include <net/if_arp.h>
53 #include <net/iptap.h>
54 #include <net/pktap.h>
55 #include <net/nwk_wq.h>
56 #include <sys/kern_event.h>
57 #include <sys/kdebug.h>
58 #include <sys/mcache.h>
59 #include <sys/syslog.h>
60 #include <sys/protosw.h>
61 #include <sys/priv.h>
62 
63 #include <kern/assert.h>
64 #include <kern/task.h>
65 #include <kern/thread.h>
66 #include <kern/sched_prim.h>
67 #include <kern/locks.h>
68 #include <kern/zalloc.h>
69 
70 #include <net/kpi_protocol.h>
71 #include <net/if_types.h>
72 #include <net/if_ipsec.h>
73 #include <net/if_llreach.h>
74 #include <net/if_utun.h>
75 #include <net/kpi_interfacefilter.h>
76 #include <net/classq/classq.h>
77 #include <net/classq/classq_sfb.h>
78 #include <net/flowhash.h>
79 #include <net/ntstat.h>
80 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
81 #include <skywalk/lib/net_filter_event.h>
82 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
83 #include <net/if_llatbl.h>
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103 
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114 
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117 
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120 
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127 
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133 
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137 
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143 
144 #include <os/log.h>
145 
146 #define DBG_LAYER_BEG           DLILDBG_CODE(DBG_DLIL_STATIC, 0)
147 #define DBG_LAYER_END           DLILDBG_CODE(DBG_DLIL_STATIC, 2)
148 #define DBG_FNC_DLIL_INPUT      DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
149 #define DBG_FNC_DLIL_OUTPUT     DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
150 #define DBG_FNC_DLIL_IFOUT      DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
151 
152 #define IFNET_KTRACE_TX_PKT_DUMP   IFNETDBG_CODE(DBG_IFNET, 0x001)
153 #define IFNET_KTRACE_RX_PKT_DUMP   IFNETDBG_CODE(DBG_IFNET, 0x002)
154 
155 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
156 #define MAX_LINKADDR        4 /* LONGWORDS */
157 
158 
159 #if 1
160 #define DLIL_PRINTF     printf
161 #else
162 #define DLIL_PRINTF     kprintf
163 #endif
164 
165 #define IF_DATA_REQUIRE_ALIGNED_64(f)   \
166 	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
167 
168 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f)     \
169 	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
170 
171 enum {
172 	kProtoKPI_v1    = 1,    /* protocol attached via v1 KPI; uses kpi.v1 callbacks */
173 	kProtoKPI_v2    = 2     /* protocol attached via v2 KPI; uses kpi.v2 callbacks */
174 };
175 
176 /*
177  * List of if_proto structures in if_proto_hash[] is protected by
178  * the ifnet lock.  The rest of the fields are initialized at protocol
179  * attach time and never change, thus no lock required as long as
180  * a reference to it is valid, via if_proto_ref().
181  */
182 struct if_proto {
183 	SLIST_ENTRY(if_proto)       next_hash;          /* if_proto_hash[] linkage; ifnet lock */
184 	u_int32_t                   refcount;           /* refs; see if_proto_ref()/if_proto_free() */
185 	u_int32_t                   detached;           /* detach indicator */
186 	struct ifnet                *ifp;               /* interface the protocol is attached to */
187 	protocol_family_t           protocol_family;    /* attached protocol family */
188 	int                         proto_kpi;          /* kProtoKPI_v1 or kProtoKPI_v2 */
189 	union {                     /* callbacks; arm selected by proto_kpi */
190 		struct {
191 			proto_media_input               input;
192 			proto_media_preout              pre_output;
193 			proto_media_event               event;
194 			proto_media_ioctl               ioctl;
195 			proto_media_detached            detached;
196 			proto_media_resolve_multi       resolve_multi;
197 			proto_media_send_arp            send_arp;
198 		} v1;   /* v1 KPI callbacks (single-packet input) */
199 		struct {
200 			proto_media_input_v2            input;
201 			proto_media_preout              pre_output;
202 			proto_media_event               event;
203 			proto_media_ioctl               ioctl;
204 			proto_media_detached            detached;
205 			proto_media_resolve_multi       resolve_multi;
206 			proto_media_send_arp            send_arp;
207 		} v2;   /* v2 KPI callbacks (proto_media_input_v2 input) */
208 	} kpi;
209 };
210 
211 SLIST_HEAD(proto_hash_entry, if_proto);
212 
213 #define DLIL_SDLDATALEN \
214 	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
215 
216 struct dlil_ifnet {
217 	struct ifnet    dl_if;                  /* public ifnet */
218 	/*
219 	 * DLIL private fields, protected by dl_if_lock
220 	 */
221 	decl_lck_mtx_data(, dl_if_lock);
222 	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
223 	u_int32_t dl_if_flags;                  /* flags (below) */
224 	u_int32_t dl_if_refcnt;                 /* refcnt */
225 	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
226 	void    *dl_if_uniqueid;                /* unique interface id */
227 	size_t  dl_if_uniqueid_len;             /* length of the unique id */
228 	char    dl_if_namestorage[IFNAMSIZ];    /* interface name storage */
229 	char    dl_if_xnamestorage[IFXNAMSIZ];  /* external name storage */
230 	struct {
231 		struct ifaddr   ifa;            /* lladdr ifa */
232 		u_int8_t        asdl[DLIL_SDLMAXLEN]; /* addr storage */
233 		u_int8_t        msdl[DLIL_SDLMAXLEN]; /* mask storage */
234 	} dl_if_lladdr;
235 	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
236 	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
237 	u_int8_t dl_if_permanent_ether_is_set;  /* non-zero when dl_if_permanent_ether is valid */
238 	u_int8_t dl_if_unused;                  /* explicit pad; currently unused */
239 	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
240 	ctrace_t        dl_if_attach;           /* attach PC stacktrace */
241 	ctrace_t        dl_if_detach;           /* detach PC stacktrace */
242 };
243 
244 /* Values for dl_if_flags (private to DLIL) */
245 #define DLIF_INUSE      0x1     /* DLIL ifnet recycler, ifnet in use */
246 #define DLIF_REUSE      0x2     /* DLIL ifnet recycles, ifnet is not new */
247 #define DLIF_DEBUG      0x4     /* has debugging info */
248 
249 #define IF_REF_TRACE_HIST_SIZE  8       /* size of ref trace history */
250 
251 /* For gdb */
252 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
253 
254 struct dlil_ifnet_dbg {         /* debug-enabled dlil_ifnet (see DLIF_DEBUG) */
255 	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
256 	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
257 	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
258 	/*
259 	 * Circular lists of ifnet_{reference,release} callers.
260 	 */
261 	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
262 	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
263 };
264 
265 #define DLIL_TO_IFP(s)  (&s->dl_if)
266 #define IFP_TO_DLIL(s)  ((struct dlil_ifnet *)s)
267 
268 struct ifnet_filter {
269 	TAILQ_ENTRY(ifnet_filter)       filt_next;      /* interface filter list linkage */
270 	u_int32_t                       filt_skip;      /* when set, filter is bypassed — NOTE(review): confirm against filter input/output paths */
271 	u_int32_t                       filt_flags;     /* filter flags */
272 	ifnet_t                         filt_ifp;       /* interface being filtered */
273 	const char                      *filt_name;     /* filter name */
274 	void                            *filt_cookie;   /* opaque client cookie passed to callbacks */
275 	protocol_family_t               filt_protocol;  /* protocol this filter applies to */
276 	iff_input_func                  filt_input;     /* inbound packet callback */
277 	iff_output_func                 filt_output;    /* outbound packet callback */
278 	iff_event_func                  filt_event;     /* interface event callback */
279 	iff_ioctl_func                  filt_ioctl;     /* ioctl callback */
280 	iff_detached_func               filt_detached;  /* detach notification callback */
281 };
282 
283 struct proto_input_entry;
284 
285 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
286 
287 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
288 
289 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
290 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
291 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
292 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
293 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
294 
295 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
296 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
297     &dlil_lck_attributes);
298 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
299     &dlil_lck_attributes);
300 
301 #if DEBUG
302 static unsigned int ifnet_debug = 1;    /* debugging (enabled) */
303 #else
304 static unsigned int ifnet_debug;        /* debugging (disabled) */
305 #endif /* !DEBUG */
306 static unsigned int dlif_size;          /* size of dlil_ifnet to allocate */
307 static unsigned int dlif_bufsize;       /* size of dlif_size + headroom */
308 static struct zone *dlif_zone;          /* zone for dlil_ifnet */
309 #define DLIF_ZONE_NAME          "ifnet"         /* zone name */
310 
311 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
312 
313 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
314 
315 static unsigned int dlif_tcpstat_size;  /* size of tcpstat_local to allocate */
316 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
317 static struct zone *dlif_tcpstat_zone;          /* zone for tcpstat_local */
318 #define DLIF_TCPSTAT_ZONE_NAME  "ifnet_tcpstat" /* zone name */
319 
320 static unsigned int dlif_udpstat_size;  /* size of udpstat_local to allocate */
321 static unsigned int dlif_udpstat_bufsize;       /* size of dlif_udpstat_size + headroom */
322 static struct zone *dlif_udpstat_zone;          /* zone for udpstat_local */
323 #define DLIF_UDPSTAT_ZONE_NAME  "ifnet_udpstat" /* zone name */
324 
325 static u_int32_t net_rtref;
326 
327 static struct dlil_main_threading_info dlil_main_input_thread_info;
328 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
329     (struct dlil_threading_info *)&dlil_main_input_thread_info;
330 
331 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
332 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
333 static void dlil_if_trace(struct dlil_ifnet *, int);
334 static void if_proto_ref(struct if_proto *);
335 static void if_proto_free(struct if_proto *);
336 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
337 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
338     u_int32_t list_count);
339 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
340 static void if_flt_monitor_busy(struct ifnet *);
341 static void if_flt_monitor_unbusy(struct ifnet *);
342 static void if_flt_monitor_enter(struct ifnet *);
343 static void if_flt_monitor_leave(struct ifnet *);
344 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
345     char **, protocol_family_t);
346 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
347     protocol_family_t);
348 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
349     const struct sockaddr_dl *);
350 static int ifnet_lookup(struct ifnet *);
351 static void if_purgeaddrs(struct ifnet *);
352 
353 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
354     struct mbuf *, char *);
355 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
356     struct mbuf *);
357 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
358     mbuf_t *, const struct sockaddr *, void *, char *, char *);
359 static void ifproto_media_event(struct ifnet *, protocol_family_t,
360     const struct kev_msg *);
361 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
362     unsigned long, void *);
363 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
364     struct sockaddr_dl *, size_t);
365 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
366     const struct sockaddr_dl *, const struct sockaddr *,
367     const struct sockaddr_dl *, const struct sockaddr *);
368 
369 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
370     struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
371     boolean_t poll, struct thread *tp);
372 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
373     struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
374 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
375 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
376     protocol_family_t *);
377 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
378     const struct ifnet_demux_desc *, u_int32_t);
379 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
380 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
381 #if !XNU_TARGET_OS_OSX
382 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
383     const struct sockaddr *, const char *, const char *,
384     u_int32_t *, u_int32_t *);
385 #else /* XNU_TARGET_OS_OSX */
386 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
387     const struct sockaddr *, const char *, const char *);
388 #endif /* XNU_TARGET_OS_OSX */
389 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
390     const struct sockaddr *, const char *, const char *,
391     u_int32_t *, u_int32_t *);
392 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
393 static void ifp_if_free(struct ifnet *);
394 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
395 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
396 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
397 
398 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
399     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
400     boolean_t, struct thread *);
401 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
402     struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
403     boolean_t, struct thread *);
404 
405 static void dlil_main_input_thread_func(void *, wait_result_t);
406 static void dlil_main_input_thread_cont(void *, wait_result_t);
407 
408 static void dlil_input_thread_func(void *, wait_result_t);
409 static void dlil_input_thread_cont(void *, wait_result_t);
410 
411 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
412 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
413 
414 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
415     thread_continue_t *);
416 static void dlil_terminate_input_thread(struct dlil_threading_info *);
417 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
418     struct dlil_threading_info *, struct ifnet *, boolean_t);
419 static boolean_t dlil_input_stats_sync(struct ifnet *,
420     struct dlil_threading_info *);
421 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
422     u_int32_t, ifnet_model_t, boolean_t);
423 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
424     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
425 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
426 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
427 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
428 #if DEBUG || DEVELOPMENT
429 static void dlil_verify_sum16(void);
430 #endif /* DEBUG || DEVELOPMENT */
431 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
432     protocol_family_t);
433 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
434     protocol_family_t);
435 
436 static void dlil_incr_pending_thread_count(void);
437 static void dlil_decr_pending_thread_count(void);
438 
439 static void ifnet_detacher_thread_func(void *, wait_result_t);
440 static void ifnet_detacher_thread_cont(void *, wait_result_t);
441 static void ifnet_detach_final(struct ifnet *);
442 static void ifnet_detaching_enqueue(struct ifnet *);
443 static struct ifnet *ifnet_detaching_dequeue(void);
444 
445 static void ifnet_start_thread_func(void *, wait_result_t);
446 static void ifnet_start_thread_cont(void *, wait_result_t);
447 
448 static void ifnet_poll_thread_func(void *, wait_result_t);
449 static void ifnet_poll_thread_cont(void *, wait_result_t);
450 
451 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
452     classq_pkt_t *, boolean_t, boolean_t *);
453 
454 static void ifp_src_route_copyout(struct ifnet *, struct route *);
455 static void ifp_src_route_copyin(struct ifnet *, struct route *);
456 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
457 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
458 
459 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
460 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
461 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
462 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
463 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
464 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
465 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
466 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
467 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
468 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
469 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
470 
471 struct chain_len_stats tx_chain_len_stats;
472 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
473 
474 #if TEST_INPUT_THREAD_TERMINATION
475 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
476 #endif /* TEST_INPUT_THREAD_TERMINATION */
477 
478 
479 /* The following are protected by dlil_ifnet_lock */
480 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
481 static u_int32_t ifnet_detaching_cnt;
482 static boolean_t ifnet_detaching_embryonic;
483 static void *ifnet_delayed_run; /* wait channel for detaching thread */
484 
485 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
486     &dlil_lck_attributes);
487 
488 static uint32_t ifnet_flowhash_seed;
489 
490 struct ifnet_flowhash_key {     /* input key for ifnet_calc_flowhash() */
491 	char            ifk_name[IFNAMSIZ];     /* interface name */
492 	uint32_t        ifk_unit;               /* interface unit number */
493 	uint32_t        ifk_flags;              /* interface flags */
494 	uint32_t        ifk_eflags;             /* extended flags */
495 	uint32_t        ifk_capabilities;       /* capability bits */
496 	uint32_t        ifk_capenable;          /* enabled capability bits */
497 	uint32_t        ifk_output_sched_model; /* output scheduling model */
498 	uint32_t        ifk_rand1;              /* randomization input */
499 	uint32_t        ifk_rand2;              /* randomization input */
500 };
501 
502 /* Flow control entry per interface */
503 struct ifnet_fc_entry {
504 	RB_ENTRY(ifnet_fc_entry) ifce_entry;    /* ifnet_fc_tree linkage */
505 	u_int32_t       ifce_flowhash;          /* interface flow hash (lookup key) */
506 	struct ifnet    *ifce_ifp;              /* interface this entry describes */
507 };
508 
509 static uint32_t ifnet_calc_flowhash(struct ifnet *);
510 static int ifce_cmp(const struct ifnet_fc_entry *,
511     const struct ifnet_fc_entry *);
512 static int ifnet_fc_add(struct ifnet *);
513 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
514 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
515 
516 /* protected by ifnet_fc_lock */
517 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
518 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
519 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
520 
521 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
522 
523 extern void bpfdetach(struct ifnet *);
524 extern void proto_input_run(void);
525 
526 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
527     u_int32_t flags);
528 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
529     u_int32_t flags);
530 
531 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
532 
533 #if CONFIG_MACF
534 #if !XNU_TARGET_OS_OSX
535 int dlil_lladdr_ckreq = 1;
536 #else /* XNU_TARGET_OS_OSX */
537 int dlil_lladdr_ckreq = 0;
538 #endif /* XNU_TARGET_OS_OSX */
539 #endif /* CONFIG_MACF */
540 
541 #if DEBUG
542 int dlil_verbose = 1;
543 #else
544 int dlil_verbose = 0;
545 #endif /* DEBUG */
546 #if IFNET_INPUT_SANITY_CHK
547 /* sanity checking of input packet lists received */
548 static u_int32_t dlil_input_sanity_check = 0;
549 #endif /* IFNET_INPUT_SANITY_CHK */
550 /* rate limit debug messages */
551 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
552 
553 SYSCTL_DECL(_net_link_generic_system);
554 
555 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
556     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
557 
558 #define IF_SNDQ_MINLEN  32
559 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
560 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
561     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
562     sysctl_sndq_maxlen, "I", "Default transmit queue max length");
563 
564 #define IF_RCVQ_MINLEN  32
565 #define IF_RCVQ_MAXLEN  256
566 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
567 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
568     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
569     sysctl_rcvq_maxlen, "I", "Default receive queue max length");
570 
571 #define IF_RXPOLL_DECAY         2       /* ilog2 of EWMA decay rate (4) */
572 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
573 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
574     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
575     "ilog2 of EWMA decay rate of avg inbound packets");
576 
577 #define IF_RXPOLL_MODE_HOLDTIME_MIN     (10ULL * 1000 * 1000)   /* 10 ms */
578 #define IF_RXPOLL_MODE_HOLDTIME         (1000ULL * 1000 * 1000) /* 1 sec */
579 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
580 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
581     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
582     IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
583     "Q", "input poll mode freeze time");
584 
585 #define IF_RXPOLL_SAMPLETIME_MIN        (1ULL * 1000 * 1000)    /* 1 ms */
586 #define IF_RXPOLL_SAMPLETIME            (10ULL * 1000 * 1000)   /* 10 ms */
587 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
588 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
589     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
590     IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
591     "Q", "input poll sampling time");
592 
593 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
594 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
595     CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
596     IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
597     "Q", "input poll interval (time)");
598 
599 #define IF_RXPOLL_INTERVAL_PKTS 0       /* 0 (disabled) */
600 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
601 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
602     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
603     IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
604 
605 #define IF_RXPOLL_WLOWAT        10
606 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
607 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
608     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
609     IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
610     "I", "input poll wakeup low watermark");
611 
612 #define IF_RXPOLL_WHIWAT        100
613 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
614 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
615     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
616     IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
617     "I", "input poll wakeup high watermark");
618 
619 static u_int32_t if_rxpoll_max = 0;                     /* 0 (automatic) */
620 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
621     CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
622     "max packets per poll call");
623 
624 u_int32_t if_rxpoll = 1;
625 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
626     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
627     sysctl_rxpoll, "I", "enable opportunistic input polling");
628 
629 #if TEST_INPUT_THREAD_TERMINATION
630 static u_int32_t if_input_thread_termination_spin = 0;
631 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
632     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
633     &if_input_thread_termination_spin, 0,
634     sysctl_input_thread_termination_spin,
635     "I", "input thread termination spin limit");
636 #endif /* TEST_INPUT_THREAD_TERMINATION */
637 
638 static u_int32_t cur_dlil_input_threads = 0;
639 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
640     CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
641     "Current number of DLIL input threads");
642 
643 #if IFNET_INPUT_SANITY_CHK
644 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
645     CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
646     "Turn on sanity checking in DLIL input");
647 #endif /* IFNET_INPUT_SANITY_CHK */
648 
649 static u_int32_t if_flowadv = 1;
650 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
651     CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
652     "enable flow-advisory mechanism");
653 
654 static u_int32_t if_delaybased_queue = 1;
655 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
656     CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
657     "enable delay based dynamic queue sizing");
658 
659 static uint64_t hwcksum_in_invalidated = 0;
660 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
661     hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
662     &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
663 
664 uint32_t hwcksum_dbg = 0;
665 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
666     CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
667     "enable hardware cksum debugging");
668 
669 u_int32_t ifnet_start_delayed = 0;
670 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
671     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
672     "number of times start was delayed");
673 
/*
 * Count of times delayed transmit start was disabled (bumped by
 * ifnet_delay_start_disabled_increment()).  The description below was
 * previously a copy-paste of the start_delayed sysctl above, which made
 * the two counters indistinguishable in `sysctl -d`.
 */
674 u_int32_t ifnet_delay_start_disabled = 0;
675 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
676     CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
677     "number of times delayed start was disabled");
678 
679 #if DEVELOPMENT || DEBUG
680 static int packet_dump_trace_update SYSCTL_HANDLER_ARGS;
681 
682 struct flow_key flow_key_trace;
683 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, flow_key_trace, CTLFLAG_WR | CTLFLAG_LOCKED |
684     CTLFLAG_KERN | CTLFLAG_ANYBODY, 0, 0, packet_dump_trace_update, "S", "Set flow key for packet tracing");
685 #endif /* DEVELOPMENT || DEBUG */
686 
/*
 * ifnet_delay_start_disabled_increment:
 *   Atomically increments the ifnet_delay_start_disabled counter that is
 *   exported via the net.link.generic.system.start_delay_disabled sysctl.
 *   (The signature line below was corrupted in the cross-reference dump
 *   by a fused hyperlink anchor duplicating the function name; restored
 *   to a single declarator.)
 */
687 static inline void
688 ifnet_delay_start_disabled_increment(void)
689 {
690 	OSIncrementAtomic(&ifnet_delay_start_disabled);
691 }
692 
/*
 * Flag bits for hwcksum_dbg_mode below; HWCKSUM_DBG_MASK is the set of
 * all valid bits.
 */
#define HWCKSUM_DBG_PARTIAL_FORCED      0x1     /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ   0x2     /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED     0x10    /* forced finalize */
#define HWCKSUM_DBG_MASK \
	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ |   \
	HWCKSUM_DBG_FINALIZE_FORCED)

/* hardware checksum debugging mode (combination of HWCKSUM_DBG_* bits) */
static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

/* read-only debug counters, exported via sysctl below */
static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

/* tunable receive offsets used by the partial-checksum debug modes */
static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");
757 
/* global enable (1) / disable (0) of transmit hardware checksum offload */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

/* global enable (1) / disable (0) of receive hardware checksum offload */
uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

/* read-only export of transmit chain length statistics */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

/* interface data-threshold notification controls */
static uint32_t threshold_notify = 1;           /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2;         /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");
783 
#if (DEVELOPMENT || DEBUG)
/* keep-alive offload frame dump, DEVELOPMENT/DEBUG kernels only */
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

/* global networking API usage statistics, exported as net.api_stats */
struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

/* non-zero enables wake-packet debug logging */
uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

/* global feature switches for DLIL input handling */
unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t        inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
810 
811 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)812 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
813 {
814 	/*
815 	 * update filter count and route_generation ID to let TCP
816 	 * know it should reevalute doing TSO or not
817 	 */
818 	if (filter_enable) {
819 		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
820 	} else {
821 		VERIFY(ifp->if_flt_no_tso_count != 0);
822 		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
823 	}
824 	routegenid_update();
825 }
826 
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* derived from the corresponding IF_ATTACH_NX_* bits of the default */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
845 
#if (DEVELOPMENT || DEBUG)
/*
 * Read/write sysctl handler for net.link.generic.system.if_attach_nx
 * (DEVELOPMENT/DEBUG kernels only).  Any attempt to flip the
 * IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT bit through this knob is
 * rejected with ENOTSUP; all other bits are taken as-is.
 */
static int
if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
	    &new_value, &changed);
	if (error) {
		return error;
	}
	if (changed) {
		/* transport netagent bit cannot be changed here */
		if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
		    (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
			return ENOTSUP;
		}
		if_attach_nx = new_value;
	}
	return 0;
}

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
873 
874 static int
875 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
876 {
877 #pragma unused(oidp, arg1, arg2)
878 	unsigned int new_value;
879 	int changed;
880 	int error;
881 
882 	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
883 	    sizeof(if_enable_fsw_transport_netagent),
884 	    &new_value, &changed);
885 	if (error == 0 && changed != 0) {
886 		if (new_value != 0 && new_value != 1) {
887 			/* only allow 0 or 1 */
888 			error = EINVAL;
889 		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
890 			/* netagent can be enabled/disabled */
891 			if_enable_fsw_transport_netagent = new_value;
892 			if (new_value == 0) {
893 				kern_nexus_deregister_netagents();
894 			} else {
895 				kern_nexus_register_netagents();
896 			}
897 		} else {
898 			/* netagent can't be enabled */
899 			error = ENOTSUP;
900 		}
901 	}
902 	return error;
903 }
904 
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

/* forward declaration; definition follows later in this file */
static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
913 
914 boolean_t
ifnet_nx_noauto(ifnet_t ifp)915 ifnet_nx_noauto(ifnet_t ifp)
916 {
917 	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
918 }
919 
920 boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)921 ifnet_nx_noauto_flowswitch(ifnet_t ifp)
922 {
923 	return ifnet_is_low_latency(ifp);
924 }
925 
926 boolean_t
ifnet_is_low_latency(ifnet_t ifp)927 ifnet_is_low_latency(ifnet_t ifp)
928 {
929 	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
930 }
931 
/*
 * Decide whether this interface should be plumbed through the netif
 * compat layer.  Requires IF_ATTACH_NX_NETIF_COMPAT in if_attach_nx;
 * on non-macOS targets Wi-Fi gets the compat layer selectively (see
 * the comment below), everything else unconditionally.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point: interface named exactly "ap" */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
956 
957 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)958 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
959 {
960 	if (if_is_fsw_transport_netagent_enabled()) {
961 		/* check if netagent has been manually enabled for ipsec/utun */
962 		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
963 			return ipsec_interface_needs_netagent(ifp);
964 		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
965 			return utun_interface_needs_netagent(ifp);
966 		}
967 
968 		/* check ifnet no auto nexus override */
969 		if (ifnet_nx_noauto(ifp)) {
970 			return FALSE;
971 		}
972 
973 		/* check global if_attach_nx configuration */
974 		switch (ifp->if_family) {
975 		case IFNET_FAMILY_CELLULAR:
976 		case IFNET_FAMILY_ETHERNET:
977 			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
978 				return TRUE;
979 			}
980 			break;
981 		default:
982 			break;
983 		}
984 	}
985 	return FALSE;
986 }
987 
988 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)989 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
990 {
991 #pragma unused(ifp)
992 	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
993 		return TRUE;
994 	}
995 	return FALSE;
996 }
997 
998 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)999 ifnet_needs_netif_netagent(ifnet_t ifp)
1000 {
1001 #pragma unused(ifp)
1002 	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1003 }
1004 
/*
 * Tear down a single nexus provider instance.
 *
 * If a device attachment UUID is supplied, the device is detached
 * from the instance first, then the instance itself is freed.
 * Failures are logged (prefixed with func_str) but do not abort the
 * teardown.
 *
 * Returns TRUE if an instance was present and teardown was attempted,
 * FALSE if there was nothing to detach.
 */
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t         err;

	if (instance == NULL || uuid_is_null(instance)) {
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}
1031 
1032 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1033 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1034     uuid_t device)
1035 {
1036 	boolean_t               detached = FALSE;
1037 	nexus_controller_t      controller = kern_nexus_shared_controller();
1038 	int                     err;
1039 
1040 	if (dlil_detach_nexus_instance(controller, func_str, instance,
1041 	    device)) {
1042 		detached = TRUE;
1043 	}
1044 	if (provider != NULL && !uuid_is_null(provider)) {
1045 		detached = TRUE;
1046 		err = kern_nexus_controller_deregister_provider(controller,
1047 		    provider);
1048 		if (err != 0) {
1049 			DLIL_PRINTF("%s deregister_provider %d\n",
1050 			    func_str, err);
1051 		}
1052 	}
1053 	return detached;
1054 }
1055 
/*
 * Register a nexus provider of the given type (netif or flowswitch)
 * named "com.apple.<type>.<ifname>" and allocate one instance of it.
 *
 * On success both *provider and *instance are populated and 0 is
 * returned.  On failure the provider is deregistered again (when it
 * was created) and the error is returned.  Note that the success path
 * also flows through the "failed" label with err == 0.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t          dom_prov;
	errno_t         err;
	nexus_name_t    provider_name;
	const char      *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* undo the provider registration made above */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
failed:
	return err;
}
1105 
/*
 * Create a netif nexus provider + instance for the interface and
 * attach the interface to it.
 *
 * Returns TRUE on success (netif_nx filled in), FALSE if the
 * interface already has a nexus attached or any step fails; partially
 * created state is torn down on failure.
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1159 
1160 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1161 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1162 {
1163 	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1164 	    IFNET_IS_VMNET(ifp)) {
1165 		goto failed;
1166 	}
1167 	switch (ifp->if_type) {
1168 	case IFT_CELLULAR:
1169 	case IFT_ETHER:
1170 		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1171 			/* don't auto-attach */
1172 			goto failed;
1173 		}
1174 		break;
1175 	default:
1176 		/* don't auto-attach */
1177 		goto failed;
1178 	}
1179 	return dlil_attach_netif_nexus_common(ifp, netif_nx);
1180 
1181 failed:
1182 	return FALSE;
1183 }
1184 
1185 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1186 dlil_is_native_netif_nexus(ifnet_t ifp)
1187 {
1188 	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1189 }
1190 
1191 __attribute__((noinline))
1192 static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)1193 dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
1194 {
1195 	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
1196 	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
1197 }
1198 
1199 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1200 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1201 {
1202 	struct ifreq        ifr;
1203 	int                 error;
1204 
1205 	bzero(&ifr, sizeof(ifr));
1206 	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1207 	if (error == 0) {
1208 		*ifdm_p = ifr.ifr_devmtu;
1209 	}
1210 	return error;
1211 }
1212 
1213 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1214 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1215     bool *use_multi_buflet, uint32_t *large_buf_size)
1216 {
1217 	struct kern_pbufpool_memory_info rx_pp_info;
1218 	struct kern_pbufpool_memory_info tx_pp_info;
1219 	uint32_t if_max_mtu = 0;
1220 	uint32_t drv_buf_size;
1221 	struct ifdevmtu ifdm;
1222 	int err;
1223 
1224 	/*
1225 	 * To perform intra-stack RX aggregation flowswitch needs to use
1226 	 * multi-buflet packet.
1227 	 */
1228 	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1229 
1230 	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1231 	/*
1232 	 * IP over Thunderbolt interface can deliver the largest IP packet,
1233 	 * but the driver advertises the MAX MTU as only 9K.
1234 	 */
1235 	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1236 		if_max_mtu = IP_MAXPACKET;
1237 		goto skip_mtu_ioctl;
1238 	}
1239 
1240 	/* determine max mtu */
1241 	bzero(&ifdm, sizeof(ifdm));
1242 	err = dlil_siocgifdevmtu(ifp, &ifdm);
1243 	if (__improbable(err != 0)) {
1244 		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1245 		    __func__, if_name(ifp));
1246 		/* use default flowswitch buffer size */
1247 		if_max_mtu = NX_FSW_BUFSIZE;
1248 	} else {
1249 		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1250 		    ifdm.ifdm_max, ifdm.ifdm_current);
1251 		/* rdar://problem/44589731 */
1252 		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1253 	}
1254 
1255 skip_mtu_ioctl:
1256 	if (if_max_mtu == 0) {
1257 		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1258 		    __func__, if_name(ifp));
1259 		return EINVAL;
1260 	}
1261 	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1262 		DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1263 		    "max bufsize(%d)\n", __func__,
1264 		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1265 		return EINVAL;
1266 	}
1267 
1268 	/*
1269 	 * for skywalk native driver, consult the driver packet pool also.
1270 	 */
1271 	if (dlil_is_native_netif_nexus(ifp)) {
1272 		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1273 		    &tx_pp_info);
1274 		if (err != 0) {
1275 			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1276 			    __func__, if_name(ifp));
1277 			return ENXIO;
1278 		}
1279 		drv_buf_size = tx_pp_info.kpm_bufsize *
1280 		    tx_pp_info.kpm_max_frags;
1281 		if (if_max_mtu > drv_buf_size) {
1282 			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1283 			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
1284 			    if_name(ifp), rx_pp_info.kpm_bufsize,
1285 			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1286 			    tx_pp_info.kpm_max_frags, if_max_mtu);
1287 			return EINVAL;
1288 		}
1289 	} else {
1290 		drv_buf_size = if_max_mtu;
1291 	}
1292 
1293 	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1294 		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1295 		*use_multi_buflet = true;
1296 		/* default flowswitch buffer size */
1297 		*buf_size = NX_FSW_BUFSIZE;
1298 		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1299 	} else {
1300 		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1301 	}
1302 
1303 	/*
1304 	 * if HW TSO is enabled on a Skywalk native interface then make
1305 	 * the flowswitch default buffer be able to handle max TSO segment.
1306 	 */
1307 	uint32_t tso_v4_mtu = 0;
1308 	uint32_t tso_v6_mtu = 0;
1309 #ifdef XNU_TARGET_OS_OSX
1310 	if (dlil_is_native_netif_nexus(ifp)) {
1311 		if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
1312 			tso_v4_mtu = ifp->if_tso_v4_mtu;
1313 		}
1314 		if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
1315 			tso_v6_mtu = ifp->if_tso_v6_mtu;
1316 		}
1317 	}
1318 #endif /* XNU_TARGET_OS_OSX */
1319 	if ((tso_v4_mtu != 0) || (tso_v6_mtu != 0)) {
1320 		*buf_size = max(*buf_size, max(tso_v4_mtu, tso_v6_mtu));
1321 		ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1322 	}
1323 	if (*buf_size >= *large_buf_size) {
1324 		*large_buf_size = 0;
1325 	}
1326 	return 0;
1327 }
1328 
/*
 * Create and attach a flowswitch nexus for the interface.
 *
 * Checks eligibility (auto-attach allowed, Skywalk capable,
 * IF_ATTACH_NX_FLOWSWITCH enabled), computes the buffer geometry,
 * creates the flowswitch provider + instance and attaches the
 * interface's netif as its device port.
 *
 * Returns TRUE on success (nexus_fsw filled in); FALSE otherwise,
 * with any partially created state torn down and the reason logged.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t            attr = NULL;
	nexus_controller_t      controller;
	errno_t                 err = 0;
	uuid_t                  netif;
	uint32_t                buf_size = 0;
	uint32_t                large_buf_size = 0;
	bool                    multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	/* err == 0 here means we bailed on an eligibility check */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1427 
/*
 * Attach a flowswitch nexus to the interface and publish it in
 * ifp->if_nx_flowswitch under the ifnet lock.  If the interface is no
 * longer fully attached by the time the flowswitch is created, the
 * freshly created nexus is torn down again.
 *
 * Returns TRUE only when the flowswitch was attached and published.
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t               attached;
	if_nexus_flowswitch     nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
		    if_name(ifp));
		return FALSE;
	}

	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		/* publish (or roll back) under the ifnet lock */
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}
1473 
1474 __attribute__((noinline))
1475 static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)1476 dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
1477 {
1478 	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
1479 	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
1480 }
1481 
1482 __attribute__((noinline))
1483 static void
dlil_netif_detach_notify(ifnet_t ifp)1484 dlil_netif_detach_notify(ifnet_t ifp)
1485 {
1486 	void (*detach_notify)(struct nexus_netif_adapter *);
1487 
1488 	/*
1489 	 * This is only needed for low latency interfaces for now.
1490 	 */
1491 	if (!ifnet_is_low_latency(ifp)) {
1492 		return;
1493 	}
1494 	detach_notify = (ifp->if_na_ops != NULL) ? ifp->if_na_ops->ni_detach_notify : NULL;
1495 	if (detach_notify != NULL) {
1496 		(*detach_notify)(ifp->if_na);
1497 	} else {
1498 		DLIL_PRINTF("%s: %s has no detach notify calback\n",
1499 		    __func__, if_name(ifp));
1500 	}
1501 }
1502 
/*
 * Quiesce data movement on the interface, detach both the flowswitch
 * and netif nexuses (when present), clear their state in the ifnet,
 * and resume data movement.  The ASSERTs enforce the invariant that
 * provider/instance/attachment UUIDs are either all set or all null.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block new data movement and drain in-flight activity */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1534 
1535 boolean_t
ifnet_add_netagent(ifnet_t ifp)1536 ifnet_add_netagent(ifnet_t ifp)
1537 {
1538 	int     error;
1539 
1540 	error = kern_nexus_interface_add_netagent(ifp);
1541 	os_log(OS_LOG_DEFAULT,
1542 	    "kern_nexus_interface_add_netagent(%s) returned %d",
1543 	    ifp->if_xname, error);
1544 	return error == 0;
1545 }
1546 
1547 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1548 ifnet_remove_netagent(ifnet_t ifp)
1549 {
1550 	int     error;
1551 
1552 	error = kern_nexus_interface_remove_netagent(ifp);
1553 	os_log(OS_LOG_DEFAULT,
1554 	    "kern_nexus_interface_remove_netagent(%s) returned %d",
1555 	    ifp->if_xname, error);
1556 	return error == 0;
1557 }
1558 
1559 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1560 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1561 {
1562 	if (!IF_FULLY_ATTACHED(ifp)) {
1563 		return FALSE;
1564 	}
1565 	return dlil_attach_flowswitch_nexus(ifp);
1566 }
1567 
1568 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1569 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1570 {
1571 	if_nexus_flowswitch     nexus_fsw;
1572 
1573 	ifnet_lock_exclusive(ifp);
1574 	nexus_fsw = ifp->if_nx_flowswitch;
1575 	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1576 	ifnet_lock_done(ifp);
1577 	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1578 	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1579 }
1580 
1581 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1582 ifnet_attach_netif_nexus(ifnet_t ifp)
1583 {
1584 	boolean_t       nexus_attached;
1585 	if_nexus_netif  nexus_netif;
1586 
1587 	if (!IF_FULLY_ATTACHED(ifp)) {
1588 		return FALSE;
1589 	}
1590 	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1591 	if (nexus_attached) {
1592 		ifnet_lock_exclusive(ifp);
1593 		ifp->if_nx_netif = nexus_netif;
1594 		ifnet_lock_done(ifp);
1595 	}
1596 	return nexus_attached;
1597 }
1598 
1599 boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)1600 ifnet_detach_netif_nexus(ifnet_t ifp)
1601 {
1602 	if_nexus_netif  nexus_netif;
1603 
1604 	ifnet_lock_exclusive(ifp);
1605 	nexus_netif = ifp->if_nx_netif;
1606 	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
1607 	ifnet_lock_done(ifp);
1608 
1609 	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
1610 	           nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
1611 }
1612 
1613 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1614 ifnet_attach_native_flowswitch(ifnet_t ifp)
1615 {
1616 	if (!dlil_is_native_netif_nexus(ifp)) {
1617 		/* not a native netif */
1618 		return;
1619 	}
1620 	ifnet_attach_flowswitch_nexus(ifp);
1621 }
1622 
1623 #endif /* SKYWALK */
1624 
/*
 * Sanity-check an inbound mbuf: it must carry a packet header, and its
 * recorded receive interface must match ifp (the loopback interface is
 * exempt from the rcvif match).  Panics on violation.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	/* NOTREACHED */                                        \
	}                                                               \
}
1633 
/*
 * Exponentially-weighted moving average update:
 *   old = ((old << decay) - old + new) >> decay
 * i.e. the new sample contributes with weight 1/2^decay.  When the
 * average is still zero it is seeded directly with the new sample.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
1642 
#define MBPS    (1ULL * 1000 * 1000)
#define GBPS    (MBPS * 1000)

/*
 * Per-downlink-speed packet/byte watermarks; the table ends with a
 * sentinel entry whose speed is 0.
 */
struct rxpoll_time_tbl {
	u_int64_t       speed;          /* downlink speed */
	u_int32_t       plowat;         /* packets low watermark */
	u_int32_t       phiwat;         /* packets high watermark */
	u_int32_t       blowat;         /* bytes low watermark */
	u_int32_t       bhiwat;         /* bytes high watermark */
};

static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed =  10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024)    },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =   1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed =  10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024)   },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
1662 
/* Serializes updates to dlil_pending_thread_cnt below */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
/* Number of DLIL threads started but not yet fully initialized */
static uint32_t dlil_pending_thread_cnt = 0;
1666 
/*
 * Account for a newly spawned DLIL thread that has not yet completed
 * its initialization; balanced by dlil_decr_pending_thread_count().
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1675 
/*
 * A DLIL thread has finished initializing; drop the pending count and,
 * when it reaches zero, wake anyone sleeping on
 * &dlil_pending_thread_cnt waiting for all threads to come up.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1688 
1689 int
proto_hash_value(u_int32_t protocol_family)1690 proto_hash_value(u_int32_t protocol_family)
1691 {
1692 	/*
1693 	 * dlil_proto_unplumb_all() depends on the mapping between
1694 	 * the hash bucket index and the protocol family defined
1695 	 * here; future changes must be applied there as well.
1696 	 */
1697 	switch (protocol_family) {
1698 	case PF_INET:
1699 		return 0;
1700 	case PF_INET6:
1701 		return 1;
1702 	case PF_VLAN:
1703 		return 2;
1704 	case PF_UNSPEC:
1705 	default:
1706 		return 3;
1707 	}
1708 }
1709 
1710 /*
1711  * Caller must already be holding ifnet lock.
1712  */
1713 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1714 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1715 {
1716 	struct if_proto *proto = NULL;
1717 	u_int32_t i = proto_hash_value(protocol_family);
1718 
1719 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1720 
1721 	if (ifp->if_proto_hash != NULL) {
1722 		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1723 	}
1724 
1725 	while (proto != NULL && proto->protocol_family != protocol_family) {
1726 		proto = SLIST_NEXT(proto, next_hash);
1727 	}
1728 
1729 	if (proto != NULL) {
1730 		if_proto_ref(proto);
1731 	}
1732 
1733 	return proto;
1734 }
1735 
/* Take an additional reference on an attached protocol descriptor. */
static void
if_proto_ref(struct if_proto *proto)
{
	atomic_add_32(&proto->refcount, 1);
}
1741 
1742 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1743 
/*
 * Drop a reference on an attached protocol descriptor.  On the final
 * release: invoke the protocol's detached callback, purge its routes,
 * post KEV_DL_PROTO_DETACHED, mark the interface down if this was the
 * last attached protocol, and free the descriptor.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	/* atomic_add_32_ov returns the pre-decrement value */
	oldval = atomic_add_32_ov(&proto->refcount, -1);
	if (oldval > 1) {
		/* other references remain; nothing more to do */
		return;
	}

	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1805 
1806 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1807 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1808 {
1809 #if !MACH_ASSERT
1810 #pragma unused(ifp)
1811 #endif
1812 	unsigned int type = 0;
1813 	int ass = 1;
1814 
1815 	switch (what) {
1816 	case IFNET_LCK_ASSERT_EXCLUSIVE:
1817 		type = LCK_RW_ASSERT_EXCLUSIVE;
1818 		break;
1819 
1820 	case IFNET_LCK_ASSERT_SHARED:
1821 		type = LCK_RW_ASSERT_SHARED;
1822 		break;
1823 
1824 	case IFNET_LCK_ASSERT_OWNED:
1825 		type = LCK_RW_ASSERT_HELD;
1826 		break;
1827 
1828 	case IFNET_LCK_ASSERT_NOTOWNED:
1829 		/* nothing to do here for RW lock; bypass assert */
1830 		ass = 0;
1831 		break;
1832 
1833 	default:
1834 		panic("bad ifnet assert type: %d", what);
1835 		/* NOTREACHED */
1836 	}
1837 	if (ass) {
1838 		LCK_RW_ASSERT(&ifp->if_lock, type);
1839 	}
1840 }
1841 
/* Acquire the per-interface RW lock for reading. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Acquire the per-interface RW lock for writing. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release the per-interface RW lock (either mode). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1859 
#if INET
/* Acquire the interface's IPv4 data RW lock for reading. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Acquire the interface's IPv4 data RW lock for writing. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the interface's IPv4 data RW lock (either mode). */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif
1879 
/* Acquire the interface's IPv6 data RW lock for reading. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Acquire the interface's IPv6 data RW lock for writing. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the interface's IPv6 data RW lock (either mode). */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1897 
/* Acquire the global interface-list RW lock for reading. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Acquire the global interface-list RW lock for writing. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface-list RW lock (either mode). */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert that the interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1921 
1922 /*
1923  * dlil_ifp_protolist
1924  * - get the list of protocols attached to the interface, or just the number
1925  *   of attached protocols
1926  * - if the number returned is greater than 'list_count', truncation occurred
1927  *
1928  * Note:
1929  * - caller must already be holding ifnet lock.
1930  */
1931 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1932 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1933     u_int32_t list_count)
1934 {
1935 	u_int32_t       count = 0;
1936 	int             i;
1937 
1938 	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1939 
1940 	if (ifp->if_proto_hash == NULL) {
1941 		goto done;
1942 	}
1943 
1944 	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1945 		struct if_proto *proto;
1946 		SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1947 			if (list != NULL && count < list_count) {
1948 				list[count] = proto->protocol_family;
1949 			}
1950 			count++;
1951 		}
1952 	}
1953 done:
1954 	return count;
1955 }
1956 
/*
 * Snapshot the protocol families attached to `ifp' under the ifnet
 * lock; returns the total attached count, which may exceed `count'
 * (indicating the output array was truncated).
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
1965 
/*
 * Free a protocol-family list buffer; pairs with the kalloc_data-style
 * allocation presumably done by if_get_protolist() callers — verify at
 * each call site.
 */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1971 
1972 __private_extern__ int
dlil_post_msg(struct ifnet * ifp,u_int32_t event_subclass,u_int32_t event_code,struct net_event_data * event_data,u_int32_t event_data_len,boolean_t suppress_generation)1973 dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
1974     u_int32_t event_code, struct net_event_data *event_data,
1975     u_int32_t event_data_len, boolean_t suppress_generation)
1976 {
1977 	struct net_event_data ev_data;
1978 	struct kev_msg ev_msg;
1979 
1980 	bzero(&ev_msg, sizeof(ev_msg));
1981 	bzero(&ev_data, sizeof(ev_data));
1982 	/*
1983 	 * a net event always starts with a net_event_data structure
1984 	 * but the caller can generate a simple net event or
1985 	 * provide a longer event structure to post
1986 	 */
1987 	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
1988 	ev_msg.kev_class        = KEV_NETWORK_CLASS;
1989 	ev_msg.kev_subclass     = event_subclass;
1990 	ev_msg.event_code       = event_code;
1991 
1992 	if (event_data == NULL) {
1993 		event_data = &ev_data;
1994 		event_data_len = sizeof(struct net_event_data);
1995 	}
1996 
1997 	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
1998 	event_data->if_family = ifp->if_family;
1999 	event_data->if_unit   = (u_int32_t)ifp->if_unit;
2000 
2001 	ev_msg.dv[0].data_length = event_data_len;
2002 	ev_msg.dv[0].data_ptr    = event_data;
2003 	ev_msg.dv[1].data_length = 0;
2004 
2005 	bool update_generation = true;
2006 	if (event_subclass == KEV_DL_SUBCLASS) {
2007 		/* Don't update interface generation for frequent link quality and state changes  */
2008 		switch (event_code) {
2009 		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
2010 		case KEV_DL_RRC_STATE_CHANGED:
2011 		case KEV_DL_PRIMARY_ELECTED:
2012 			update_generation = false;
2013 			break;
2014 		default:
2015 			break;
2016 		}
2017 	}
2018 
2019 	/*
2020 	 * Some events that update generation counts might
2021 	 * want to suppress generation count.
2022 	 * One example is node presence/absence where we still
2023 	 * issue kernel event for the invocation but want to avoid
2024 	 * expensive operation of updating generation which triggers
2025 	 * NECP client updates.
2026 	 */
2027 	if (suppress_generation) {
2028 		update_generation = false;
2029 	}
2030 
2031 	return dlil_event_internal(ifp, &ev_msg, update_generation);
2032 }
2033 
/*
 * Allocate the per-interface protocol statistics blocks: 64-bit-aligned
 * TCP and UDP stat structures carved from their zones, plus the IPv4
 * and IPv6 ECN stat structures.  Returns 0 on success, EINVAL on a NULL
 * ifp.
 *
 * NOTE(review): if this is ever called when if_tcp_stat/if_udp_stat are
 * already set, `ret' remains EINVAL and the cleanup below frees the
 * existing (live) buffers before returning an error.  Callers appear to
 * invoke this exactly once per ifnet — confirm before reusing.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* error path: unwind whatever was (or already had been) allocated */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original zone pointer stashed below base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2119 
/*
 * Return an interface's opportunistic-polling state to its defaults:
 * clear the poll cycle, mode, request/update flags, all polling
 * statistics, and the mode/sample hold and last-transition timestamps.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2138 
/*
 * Set up the input-path threading state for `inp' and, when a dedicated
 * thread is required, spawn it.  Strategy selection:
 *   - ifp == NULL: the singleton main input thread (dlil_init time);
 *   - legacy device with RXPOLL enabled: hybrid polling thread;
 *   - net_async set, or a legacy device: per-interface async thread;
 *   - otherwise: synchronous strategy with no dedicated thread.
 * The chosen continuation (or NULL for synchronous) is reported through
 * *thfunc.  Returns 0 on success, ENODEV when no thread is needed;
 * failure to actually create a needed thread is fatal (panic).
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */
	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/* no polling: effectively unlimited receive queue */
		limit = (u_int32_t)-1;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		/* the main thread also owns the loopback receive queue */
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no thread to start */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2273 
#if TEST_INPUT_THREAD_TERMINATION
/*
 * Test-build-only sysctl handler for the spin count consumed by
 * dlil_terminate_input_thread()'s busy-work loop.  Reads return the
 * current value; writes are rejected with ENXIO when input polling
 * (net_rxpoll) is disabled.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* error, or read-only access: nothing further to do */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
#endif /* TEST_INPUT_THREAD_TERMINATION */
2297 
/*
 * Scrub a dlil_threading_info once its thread is gone so the structure
 * can be reused: tear down the per-thread lock and lock group, clear
 * the name, flags, stats and queue limit, and verify nothing is left
 * dangling (affinity, driver/poller threads, affinity tag).
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	/* the input queue must already have been drained */
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2323 
/*
 * Final act of a per-interface input thread, executed on the dying
 * thread itself (never the main input thread): drain and free any
 * queued packets, acknowledge the pending DLIL_INPUT_TERMINATE request
 * by setting DLIL_INPUT_TERMINATE_COMPLETE and waking the requester,
 * drop the reference taken at kernel_thread_start() time, and
 * self-terminate.  Does not return.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* detach all pending packets while holding the lock */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2371 
2372 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2373 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2374 {
2375 	thread_affinity_policy_data_t policy;
2376 
2377 	bzero(&policy, sizeof(policy));
2378 	policy.affinity_tag = tag;
2379 	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2380 	           (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2381 }
2382 
2383 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
2384 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2385 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2386     enum net_filter_event_subsystems state)
2387 {
2388 	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
2389 	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
2390 		if_enable_fsw_transport_netagent = 1;
2391 	} else {
2392 		if_enable_fsw_transport_netagent = 0;
2393 	}
2394 	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
2395 		kern_nexus_update_netagents();
2396 	} else if (!if_enable_fsw_transport_netagent) {
2397 		necp_update_all_clients();
2398 	}
2399 }
2400 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2401 
2402 void
dlil_init(void)2403 dlil_init(void)
2404 {
2405 	thread_t thread = THREAD_NULL;
2406 
2407 	/*
2408 	 * The following fields must be 64-bit aligned for atomic operations.
2409 	 */
2410 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2411 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2412 	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2413 	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2414 	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2415 	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2416 	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2417 	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2418 	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2419 	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2420 	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2421 	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2422 	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2423 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2424 	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2425 
2426 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2427 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2428 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2429 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2430 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2431 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2432 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2433 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2434 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2435 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2436 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2437 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2438 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2439 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2440 	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2441 
2442 	/*
2443 	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
2444 	 */
2445 	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
2446 	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
2447 	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
2448 	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
2449 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
2450 	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
2451 	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
2452 	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
2453 	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
2454 	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
2455 	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
2456 	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
2457 	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
2458 	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
2459 
2460 	/*
2461 	 * ... as well as the mbuf checksum flags counterparts.
2462 	 */
2463 	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
2464 	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
2465 	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
2466 	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
2467 	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
2468 	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
2469 	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
2470 	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
2471 	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
2472 	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
2473 	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
2474 
2475 	/*
2476 	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
2477 	 */
2478 	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
2479 	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
2480 
2481 	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
2482 	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
2483 	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
2484 	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
2485 
2486 	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
2487 	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
2488 	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
2489 
2490 	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
2491 	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
2492 	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
2493 	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
2494 	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
2495 	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
2496 	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
2497 	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
2498 	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
2499 	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
2500 	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
2501 	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
2502 	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
2503 	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
2504 	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
2505 	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
2506 	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
2507 	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);
2508 
2509 	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
2510 	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
2511 	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
2512 	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
2513 	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
2514 	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
2515 	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
2516 	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
2517 	_CASSERT(IFRTYPE_SUBFAMILY_DEFAULT == IFNET_SUBFAMILY_DEFAULT);
2518 
2519 	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
2520 	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
2521 
2522 	PE_parse_boot_argn("net_affinity", &net_affinity,
2523 	    sizeof(net_affinity));
2524 
2525 	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));
2526 
2527 	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));
2528 
2529 	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));
2530 
2531 	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));
2532 
2533 	VERIFY(dlil_pending_thread_cnt == 0);
2534 #if SKYWALK
2535 	boolean_t pe_enable_fsw_transport_netagent = FALSE;
2536 	boolean_t pe_disable_fsw_transport_netagent = FALSE;
2537 	boolean_t enable_fsw_netagent =
2538 	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
2539 	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
2540 
2541 	/*
2542 	 * Check the device tree to see if Skywalk netagent has been explicitly
2543 	 * enabled or disabled.  This can be overridden via if_attach_nx below.
2544 	 * Note that the property is a 0-length key, and so checking for the
2545 	 * presence itself is enough (no need to check for the actual value of
2546 	 * the retrieved variable.)
2547 	 */
2548 	pe_enable_fsw_transport_netagent =
2549 	    PE_get_default("kern.skywalk_netagent_enable",
2550 	    &pe_enable_fsw_transport_netagent,
2551 	    sizeof(pe_enable_fsw_transport_netagent));
2552 	pe_disable_fsw_transport_netagent =
2553 	    PE_get_default("kern.skywalk_netagent_disable",
2554 	    &pe_disable_fsw_transport_netagent,
2555 	    sizeof(pe_disable_fsw_transport_netagent));
2556 
2557 	/*
2558 	 * These two are mutually exclusive, i.e. they both can be absent,
2559 	 * but only one can be present at a time, and so we assert to make
2560 	 * sure it is correct.
2561 	 */
2562 	VERIFY((!pe_enable_fsw_transport_netagent &&
2563 	    !pe_disable_fsw_transport_netagent) ||
2564 	    (pe_enable_fsw_transport_netagent ^
2565 	    pe_disable_fsw_transport_netagent));
2566 
2567 	if (pe_enable_fsw_transport_netagent) {
2568 		kprintf("SK: netagent is enabled via an override for "
2569 		    "this platform\n");
2570 		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
2571 	} else if (pe_disable_fsw_transport_netagent) {
2572 		kprintf("SK: netagent is disabled via an override for "
2573 		    "this platform\n");
2574 		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
2575 	} else {
2576 		kprintf("SK: netagent is %s by default for this platform\n",
2577 		    (enable_fsw_netagent ? "enabled" : "disabled"));
2578 		if_attach_nx = IF_ATTACH_NX_DEFAULT;
2579 	}
2580 
2581 	/*
2582 	 * Now see if there's a boot-arg override.
2583 	 */
2584 	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
2585 	    sizeof(if_attach_nx));
2586 	if_enable_fsw_transport_netagent =
2587 	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
2588 
2589 	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);
2590 
2591 	if (pe_disable_fsw_transport_netagent &&
2592 	    if_enable_fsw_transport_netagent) {
2593 		kprintf("SK: netagent is force-enabled\n");
2594 	} else if (!pe_disable_fsw_transport_netagent &&
2595 	    !if_enable_fsw_transport_netagent) {
2596 		kprintf("SK: netagent is force-disabled\n");
2597 	}
2598 #ifdef XNU_TARGET_OS_OSX
2599 	if (if_enable_fsw_transport_netagent) {
2600 		net_filter_event_register(dlil_filter_event);
2601 	}
2602 #endif /* XNU_TARGET_OS_OSX */
2603 
2604 #if (DEVELOPMENT || DEBUG)
2605 	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
2606 	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
2607 #endif /* (DEVELOPMENT || DEBUG) */
2608 
2609 #endif /* SKYWALK */
2610 	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
2611 	    sizeof(struct dlil_ifnet_dbg);
2612 	/* Enforce 64-bit alignment for dlil_ifnet structure */
2613 	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
2614 	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
2615 	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);
2616 
2617 	dlif_tcpstat_size = sizeof(struct tcpstat_local);
2618 	/* Enforce 64-bit alignment for tcpstat_local structure */
2619 	dlif_tcpstat_bufsize =
2620 	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
2621 	dlif_tcpstat_bufsize = (uint32_t)
2622 	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
2623 	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
2624 	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);
2625 
2626 	dlif_udpstat_size = sizeof(struct udpstat_local);
2627 	/* Enforce 64-bit alignment for udpstat_local structure */
2628 	dlif_udpstat_bufsize =
2629 	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
2630 	dlif_udpstat_bufsize = (uint32_t)
2631 	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
2632 	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
2633 	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);
2634 
2635 	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);
2636 
2637 	TAILQ_INIT(&dlil_ifnet_head);
2638 	TAILQ_INIT(&ifnet_head);
2639 	TAILQ_INIT(&ifnet_detaching_head);
2640 	TAILQ_INIT(&ifnet_ordered_head);
2641 
2642 	/* Initialize interface address subsystem */
2643 	ifa_init();
2644 
2645 #if PF
2646 	/* Initialize the packet filter */
2647 	pfinit();
2648 #endif /* PF */
2649 
2650 	/* Initialize queue algorithms */
2651 	classq_init();
2652 
2653 	/* Initialize packet schedulers */
2654 	pktsched_init();
2655 
2656 	/* Initialize flow advisory subsystem */
2657 	flowadv_init();
2658 
2659 	/* Initialize the pktap virtual interface */
2660 	pktap_init();
2661 
2662 	/* Initialize the service class to dscp map */
2663 	net_qos_map_init();
2664 
2665 	/* Initialize the interface low power mode event handler */
2666 	if_low_power_evhdlr_init();
2667 
2668 	/* Initialize the interface offload port list subsystem */
2669 	if_ports_used_init();
2670 
2671 #if DEBUG || DEVELOPMENT
2672 	/* Run self-tests */
2673 	dlil_verify_sum16();
2674 #endif /* DEBUG || DEVELOPMENT */
2675 
2676 	/*
2677 	 * Create and start up the main DLIL input thread and the interface
2678 	 * detacher threads once everything is initialized.
2679 	 */
2680 	dlil_incr_pending_thread_count();
2681 	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);
2682 
2683 	/*
2684 	 * Create ifnet detacher thread.
2685 	 * When an interface gets detached, part of the detach processing
2686 	 * is delayed. The interface is added to delayed detach list
2687 	 * and this thread is woken up to call ifnet_detach_final
2688 	 * on these interfaces.
2689 	 */
2690 	dlil_incr_pending_thread_count();
2691 	if (kernel_thread_start(ifnet_detacher_thread_func,
2692 	    NULL, &thread) != KERN_SUCCESS) {
2693 		panic_plain("%s: couldn't create detacher thread", __func__);
2694 		/* NOTREACHED */
2695 	}
2696 	thread_deallocate(thread);
2697 
2698 	/*
2699 	 * Wait for the created kernel threads for dlil to get
2700 	 * scheduled and run at least once before we proceed
2701 	 */
2702 	lck_mtx_lock(&dlil_thread_sync_lock);
2703 	while (dlil_pending_thread_cnt != 0) {
2704 		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
2705 		    "threads to get scheduled at least once.\n", __func__);
2706 		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
2707 		    (PZERO - 1), __func__, NULL);
2708 		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
2709 	}
2710 	lck_mtx_unlock(&dlil_thread_sync_lock);
2711 	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
2712 	    "scheduled at least once. Proceeding.\n", __func__);
2713 }
2714 
/*
 * Mark the interface-filter "monitor" busy on behalf of the caller.
 * Caller must hold ifp->if_flt_lock; the busy count serializes
 * modifications to the filter list (see if_flt_monitor_enter()).
 */
static void
if_flt_monitor_busy(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	++ifp->if_flt_busy;
	/* catch a wrap of the busy counter */
	VERIFY(ifp->if_flt_busy != 0);
}
2723 
/*
 * Drop one busy reference on the filter monitor.  Thin alias for
 * if_flt_monitor_leave(), which also wakes any waiters once the
 * busy count drops to zero.  Caller holds ifp->if_flt_lock.
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2729 
2730 static void
if_flt_monitor_enter(struct ifnet * ifp)2731 if_flt_monitor_enter(struct ifnet *ifp)
2732 {
2733 	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2734 
2735 	while (ifp->if_flt_busy) {
2736 		++ifp->if_flt_waiters;
2737 		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2738 		    (PZERO - 1), "if_flt_monitor", NULL);
2739 	}
2740 	if_flt_monitor_busy(ifp);
2741 }
2742 
/*
 * Release one busy reference on the interface-filter monitor; when the
 * count reaches zero, wake up all threads parked in
 * if_flt_monitor_enter().  Caller must hold ifp->if_flt_lock.
 */
static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	/* catch an unbalanced leave (underflow) */
	VERIFY(ifp->if_flt_busy != 0);
	--ifp->if_flt_busy;

	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		/* waiters re-check the busy count after wakeup */
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
2756 
/*
 * Attach an interface filter described by if_filter to ifp and return
 * the new filter handle through filter_ref.
 *
 * Returns 0 on success, or ENXIO if the interface is not in the global
 * list or is no longer attached.  On success the filter is linked onto
 * ifp->if_flt_head and the global/per-ifnet filter statistics are
 * updated; on failure the allocated filter (if any) is freed.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/*
	 * Take an IO refcnt (second arg 1) so the interface cannot be
	 * detached while we insert the filter; released below on success.
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	/* Z_NOFAIL: allocation cannot fail, no NULL check needed */
	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 */
	if (!IFNET_IS_INTCOPROC(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	/* detached callback is installed unconditionally */
	filter->filt_detached = if_filter->iff_detached;

	/* enter the filter monitor to serialize list modification */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* non-OS (third-party) filters are also tracked per-ifnet */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the IO refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2846 
/*
 * Detach an interface filter.
 *
 * detached == 0: normal path (e.g. via dlil_detach_filter()); the
 * filter is located by scanning every attached interface's filter
 * list, unlinked under the filter monitor, and destroyed.  Returns
 * EINVAL if the filter reference is not found on any interface.
 *
 * detached != 0: implicit path from ifnet_detach_final(); the caller
 * has already emptied if_flt_head, so only counters are adjusted
 * before the filter is destroyed.
 *
 * In both paths the filter's filt_detached callback (if any) is
 * invoked before the filter memory is freed.
 */
static int
dlil_detach_filter_internal(interface_filter_t  filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;   /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevaluate doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevaluate doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* only the EINVAL path arrives here with a non-NULL filter */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2967 
2968 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2969 dlil_detach_filter(interface_filter_t filter)
2970 {
2971 	if (filter == NULL) {
2972 		return;
2973 	}
2974 	dlil_detach_filter_internal(filter, 0);
2975 }
2976 
2977 __private_extern__ boolean_t
dlil_has_ip_filter(void)2978 dlil_has_ip_filter(void)
2979 {
2980 	boolean_t has_filter = (net_api_stats.nas_ipf_add_count > 0);
2981 	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2982 	return has_filter;
2983 }
2984 
2985 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2986 dlil_has_if_filter(struct ifnet *ifp)
2987 {
2988 	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2989 	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2990 	return has_filter;
2991 }
2992 
2993 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)2994 dlil_input_wakeup(struct dlil_threading_info *inp)
2995 {
2996 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
2997 
2998 	inp->dlth_flags |= DLIL_INPUT_WAITING;
2999 	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
3000 		inp->dlth_wtot++;
3001 		wakeup_one((caddr_t)&inp->dlth_flags);
3002 	}
3003 }
3004 
/*
 * Bootstrap entry for the main DLIL input thread.  Validates the
 * threading-info state, marks the thread embryonic, and parks it in
 * dlil_main_input_thread_cont, which runs the actual work loop.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait before the self-wakeup so the wakeup is not lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3027 
/*
 * Main input thread:
 *
 *   a) handles all inbound packets for lo0
 *   b) handles all inbound packets for interfaces with no dedicated
 *	input thread (e.g. anything but Ethernet/PDP or those that support
 *	opportunistic polling.)
 *   c) protocol registrations
 *   d) packet injections
 *
 * This is the continuation routine: the thread re-enters here after
 * every thread_block_parameter() at the bottom, so all loop state
 * lives in the shared dlil_threading_info and not on the stack.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass only: clear the embryonic marker set at bootstrap */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* drop the lock while processing the dequeued chains */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblocks dlil_init()'s wait for thread startup */
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* loop again only if new work arrived while we were busy */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	/* park until the next dlil_input_wakeup() */
	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3124 
/*
 * Input thread for interfaces with legacy input model.
 *
 * Bootstrap entry: names the thread after the interface, marks it
 * embryonic, and parks it in dlil_input_thread_cont, which runs the
 * actual per-interface work loop.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* legacy interfaces must not be in rxpoll mode */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait before the self-wakeup so the wakeup is not lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3162 
/*
 * Work loop (continuation) for a dedicated legacy-model input thread.
 * Dequeues all pending packets for the interface, processes them with
 * the lock dropped, and parks again; exits via
 * dlil_terminate_input_thread() when the thread is interrupted or the
 * TERMINATE flag is set (interface detach).
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass only: clear the embryonic marker set at bootstrap */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not be worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 * (NB: the conditional deliberately guards only the
		 * single statement below.)
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while processing the dequeued chain */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblocks the waiter in the attach path */
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if new work (or other flags) arrived */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* park until the next dlil_input_wakeup() */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3266 
/*
 * Input thread for interfaces with opportunistic polling input model.
 *
 * Bootstrap entry: names the thread after the interface, marks it
 * embryonic, and parks it in dlil_rxpoll_input_thread_cont, which
 * runs the polling-aware work loop.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	/* this thread only serves legacy interfaces that support rxpoll */
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait before the self-wakeup so the wakeup is not lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3301 
3302 __attribute__((noreturn))
3303 static void
dlil_rxpoll_input_thread_cont(void * v,wait_result_t wres)3304 dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
3305 {
3306 	struct dlil_threading_info *inp = v;
3307 	struct ifnet *ifp = inp->dlth_ifp;
3308 	struct timespec ts;
3309 
3310 	lck_mtx_lock_spin(&inp->dlth_lock);
3311 	if (__improbable(wres == THREAD_INTERRUPTED ||
3312 	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
3313 		goto terminate;
3314 	}
3315 
3316 	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
3317 	inp->dlth_flags |= DLIL_INPUT_RUNNING;
3318 
3319 	while (1) {
3320 		struct mbuf *m = NULL;
3321 		uint32_t m_cnt, poll_req = 0;
3322 		uint64_t m_size = 0;
3323 		ifnet_model_t mode;
3324 		struct timespec now, delta;
3325 		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3326 		boolean_t notify;
3327 		boolean_t embryonic;
3328 		uint64_t ival;
3329 
3330 		inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3331 
3332 		if (__improbable(embryonic =
3333 		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3334 			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3335 			goto skip;
3336 		}
3337 
3338 		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
3339 			ival = IF_RXPOLL_INTERVALTIME_MIN;
3340 		}
3341 
3342 		/* Link parameters changed? */
3343 		if (ifp->if_poll_update != 0) {
3344 			ifp->if_poll_update = 0;
3345 			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
3346 		}
3347 
3348 		/* Current operating mode */
3349 		mode = ifp->if_poll_mode;
3350 
3351 		/*
3352 		 * Protocol registration and injection must always use
3353 		 * the main input thread; in theory the latter can utilize
3354 		 * the corresponding input thread where the packet arrived
3355 		 * on, but that requires our knowing the interface in advance
3356 		 * (and the benefits might not worth the trouble.)
3357 		 */
3358 		VERIFY(!(inp->dlth_flags &
3359 		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
3360 
3361 		/* Total count of all packets */
3362 		m_cnt = qlen(&inp->dlth_pkts);
3363 
3364 		/* Total bytes of all packets */
3365 		m_size = qsize(&inp->dlth_pkts);
3366 
3367 		/* Packets for this interface */
3368 		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3369 		m = pkt.cp_mbuf;
3370 		VERIFY(m != NULL || m_cnt == 0);
3371 
3372 		nanouptime(&now);
3373 		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
3374 			*(&ifp->if_poll_sample_lasttime) = *(&now);
3375 		}
3376 
3377 		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
3378 		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
3379 			u_int32_t ptot, btot;
3380 
3381 			/* Accumulate statistics for current sampling */
3382 			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);
3383 
3384 			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
3385 				goto skip;
3386 			}
3387 
3388 			*(&ifp->if_poll_sample_lasttime) = *(&now);
3389 
3390 			/* Calculate min/max of inbound bytes */
3391 			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
3392 			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
3393 				ifp->if_rxpoll_bmin = btot;
3394 			}
3395 			if (btot > ifp->if_rxpoll_bmax) {
3396 				ifp->if_rxpoll_bmax = btot;
3397 			}
3398 
3399 			/* Calculate EWMA of inbound bytes */
3400 			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);
3401 
3402 			/* Calculate min/max of inbound packets */
3403 			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
3404 			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
3405 				ifp->if_rxpoll_pmin = ptot;
3406 			}
3407 			if (ptot > ifp->if_rxpoll_pmax) {
3408 				ifp->if_rxpoll_pmax = ptot;
3409 			}
3410 
3411 			/* Calculate EWMA of inbound packets */
3412 			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
3413 
3414 			/* Reset sampling statistics */
3415 			PKTCNTR_CLEAR(&ifp->if_poll_sstats);
3416 
3417 			/* Calculate EWMA of wakeup requests */
3418 			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
3419 			    if_rxpoll_decay);
3420 			inp->dlth_wtot = 0;
3421 
3422 			if (dlil_verbose) {
3423 				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
3424 					*(&ifp->if_poll_dbg_lasttime) = *(&now);
3425 				}
3426 				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
3427 				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
3428 					*(&ifp->if_poll_dbg_lasttime) = *(&now);
3429 					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
3430 					    "limits [%d/%d], wreq avg %d "
3431 					    "limits [%d/%d], bytes avg %d "
3432 					    "limits [%d/%d]\n", if_name(ifp),
3433 					    (ifp->if_poll_mode ==
3434 					    IFNET_MODEL_INPUT_POLL_ON) ?
3435 					    "ON" : "OFF", ifp->if_rxpoll_pavg,
3436 					    ifp->if_rxpoll_pmax,
3437 					    ifp->if_rxpoll_plowat,
3438 					    ifp->if_rxpoll_phiwat,
3439 					    ifp->if_rxpoll_wavg,
3440 					    ifp->if_rxpoll_wlowat,
3441 					    ifp->if_rxpoll_whiwat,
3442 					    ifp->if_rxpoll_bavg,
3443 					    ifp->if_rxpoll_blowat,
3444 					    ifp->if_rxpoll_bhiwat);
3445 				}
3446 			}
3447 
3448 			/* Perform mode transition, if necessary */
3449 			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
3450 				*(&ifp->if_poll_mode_lasttime) = *(&now);
3451 			}
3452 
3453 			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
3454 			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
3455 				goto skip;
3456 			}
3457 
3458 			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
3459 			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
3460 			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
3461 				mode = IFNET_MODEL_INPUT_POLL_OFF;
3462 			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
3463 			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
3464 			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
3465 			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
3466 				mode = IFNET_MODEL_INPUT_POLL_ON;
3467 			}
3468 
3469 			if (mode != ifp->if_poll_mode) {
3470 				ifp->if_poll_mode = mode;
3471 				*(&ifp->if_poll_mode_lasttime) = *(&now);
3472 				poll_req++;
3473 			}
3474 		}
3475 skip:
3476 		notify = dlil_input_stats_sync(ifp, inp);
3477 
3478 		lck_mtx_unlock(&inp->dlth_lock);
3479 
3480 		if (__improbable(embryonic)) {
3481 			ifnet_decr_pending_thread_count(ifp);
3482 		}
3483 
3484 		if (__improbable(notify)) {
3485 			ifnet_notify_data_threshold(ifp);
3486 		}
3487 
3488 		/*
3489 		 * If there's a mode change and interface is still attached,
3490 		 * perform a downcall to the driver for the new mode.  Also
3491 		 * hold an IO refcnt on the interface to prevent it from
3492 		 * being detached (will be release below.)
3493 		 */
3494 		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
3495 			struct ifnet_model_params p = {
3496 				.model = mode, .reserved = { 0 }
3497 			};
3498 			errno_t err;
3499 
3500 			if (dlil_verbose) {
3501 				DLIL_PRINTF("%s: polling is now %s, "
3502 				    "pkts avg %d max %d limits [%d/%d], "
3503 				    "wreq avg %d limits [%d/%d], "
3504 				    "bytes avg %d limits [%d/%d]\n",
3505 				    if_name(ifp),
3506 				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3507 				    "ON" : "OFF", ifp->if_rxpoll_pavg,
3508 				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
3509 				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
3510 				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
3511 				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
3512 				    ifp->if_rxpoll_bhiwat);
3513 			}
3514 
3515 			if ((err = ((*ifp->if_input_ctl)(ifp,
3516 			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
3517 				DLIL_PRINTF("%s: error setting polling mode "
3518 				    "to %s (%d)\n", if_name(ifp),
3519 				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3520 				    "ON" : "OFF", err);
3521 			}
3522 
3523 			switch (mode) {
3524 			case IFNET_MODEL_INPUT_POLL_OFF:
3525 				ifnet_set_poll_cycle(ifp, NULL);
3526 				ifp->if_rxpoll_offreq++;
3527 				if (err != 0) {
3528 					ifp->if_rxpoll_offerr++;
3529 				}
3530 				break;
3531 
3532 			case IFNET_MODEL_INPUT_POLL_ON:
3533 				net_nsectimer(&ival, &ts);
3534 				ifnet_set_poll_cycle(ifp, &ts);
3535 				ifnet_poll(ifp);
3536 				ifp->if_rxpoll_onreq++;
3537 				if (err != 0) {
3538 					ifp->if_rxpoll_onerr++;
3539 				}
3540 				break;
3541 
3542 			default:
3543 				VERIFY(0);
3544 				/* NOTREACHED */
3545 			}
3546 
3547 			/* Release the IO refcnt */
3548 			ifnet_decr_iorefcnt(ifp);
3549 		}
3550 
3551 		/*
3552 		 * NOTE warning %%% attention !!!!
3553 		 * We should think about putting some thread starvation
3554 		 * safeguards if we deal with long chains of packets.
3555 		 */
3556 		if (__probable(m != NULL)) {
3557 			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
3558 		}
3559 
3560 		lck_mtx_lock_spin(&inp->dlth_lock);
3561 		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3562 		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
3563 		    DLIL_INPUT_TERMINATE))) {
3564 			break;
3565 		}
3566 	}
3567 
3568 	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3569 
3570 	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
3571 terminate:
3572 		lck_mtx_unlock(&inp->dlth_lock);
3573 		dlil_terminate_input_thread(inp);
3574 		/* NOTREACHED */
3575 	} else {
3576 		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3577 		lck_mtx_unlock(&inp->dlth_lock);
3578 		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
3579 		    inp);
3580 		/* NOTREACHED */
3581 	}
3582 
3583 	VERIFY(0);      /* we should never get here */
3584 	/* NOTREACHED */
3585 	__builtin_unreachable();
3586 }
3587 
3588 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3589 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3590 {
3591 	if (p != NULL) {
3592 		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3593 		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3594 			return EINVAL;
3595 		}
3596 		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
3597 		    p->packets_lowat >= p->packets_hiwat) {
3598 			return EINVAL;
3599 		}
3600 		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3601 		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3602 			return EINVAL;
3603 		}
3604 		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
3605 		    p->bytes_lowat >= p->bytes_hiwat) {
3606 			return EINVAL;
3607 		}
3608 		if (p->interval_time != 0 &&
3609 		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3610 			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3611 		}
3612 	}
3613 	return 0;
3614 }
3615 
/*
 * Recompute and install the rx-poll tunables for ifp, either from the
 * caller-supplied parameters (p) or auto-tuned from the link rate via
 * rxpoll_tbl.  If the link rate is unknown (0) and no parameters were
 * given, polling is effectively disabled by zeroing the low watermarks
 * and maxing out the high watermarks.
 *
 * NOTE(review): writes several ifp->if_rxpoll_* fields; callers such as
 * dlil_rxpoll_set_params() hold the input thread lock around this —
 * presumably required here as well; confirm against other call sites.
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/*
		 * Find the highest table entry whose speed does not
		 * exceed the current input link rate; n trails i so it
		 * lands on the last qualifying row.
		 */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/* a non-zero global if_rxpoll_max overrides the caller */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ?  if_rxpoll_max : p->packets_limit);
		/* likewise a non-default global interval takes precedence */
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert nanosecond holdtimes into timespec form for the poller */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3685 
3686 /*
3687  * Must be called on an attached ifnet (caller is expected to check.)
3688  * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3689  */
3690 errno_t
dlil_rxpoll_set_params(struct ifnet * ifp,struct ifnet_poll_params * p,boolean_t locked)3691 dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
3692     boolean_t locked)
3693 {
3694 	errno_t err;
3695 	struct dlil_threading_info *inp;
3696 
3697 	VERIFY(ifp != NULL);
3698 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3699 		return ENXIO;
3700 	}
3701 	err = dlil_rxpoll_validate_params(p);
3702 	if (err != 0) {
3703 		return err;
3704 	}
3705 
3706 	if (!locked) {
3707 		lck_mtx_lock(&inp->dlth_lock);
3708 	}
3709 	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3710 	/*
3711 	 * Normally, we'd reset the parameters to the auto-tuned values
3712 	 * if the the input thread detects a change in link rate.  If the
3713 	 * driver provides its own parameters right after a link rate
3714 	 * changes, but before the input thread gets to run, we want to
3715 	 * make sure to keep the driver's values.  Clearing if_poll_update
3716 	 * will achieve that.
3717 	 */
3718 	if (p != NULL && !locked && ifp->if_poll_update != 0) {
3719 		ifp->if_poll_update = 0;
3720 	}
3721 	dlil_rxpoll_update_params(ifp, p);
3722 	if (!locked) {
3723 		lck_mtx_unlock(&inp->dlth_lock);
3724 	}
3725 	return 0;
3726 }
3727 
3728 /*
3729  * Must be called on an attached ifnet (caller is expected to check.)
3730  */
3731 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3732 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3733 {
3734 	struct dlil_threading_info *inp;
3735 
3736 	VERIFY(ifp != NULL && p != NULL);
3737 	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3738 		return ENXIO;
3739 	}
3740 
3741 	bzero(p, sizeof(*p));
3742 
3743 	lck_mtx_lock(&inp->dlth_lock);
3744 	p->packets_limit = ifp->if_rxpoll_plim;
3745 	p->packets_lowat = ifp->if_rxpoll_plowat;
3746 	p->packets_hiwat = ifp->if_rxpoll_phiwat;
3747 	p->bytes_lowat = ifp->if_rxpoll_blowat;
3748 	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3749 	p->interval_time = ifp->if_rxpoll_ival;
3750 	lck_mtx_unlock(&inp->dlth_lock);
3751 
3752 	return 0;
3753 }
3754 
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	/*
	 * Basic variant: no chain tail supplied (ext = FALSE) and not a
	 * polling enqueue; counts are derived by walking m_head.
	 */
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3761 
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	/*
	 * Extended variant: driver supplies the chain tail and the stat
	 * increments (ext = TRUE requires s to be non-NULL).
	 */
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3768 
errno_t
ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	/*
	 * Polling variant (poll = TRUE): an empty chain is legal and is
	 * treated as a non-extended call with no stats to validate.
	 */
	return ifnet_input_common(ifp, m_head, m_tail, s,
	           (m_head != NULL), TRUE);
}
3776 
/*
 * Common back end for ifnet_input{,_extended,_poll}().  Validates the
 * packet chain and the caller's stat increments, takes an IO/datamov
 * reference on the interface, and hands the chain to the interface's
 * DLIL input function.  On parameter errors the chain is freed and
 * EINVAL is returned.
 *
 * ext  - TRUE when the driver supplied m_tail and stats (s).
 * poll - TRUE when called from the polling path (empty chain allowed).
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* a NULL chain is only legal for the polling variant */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail given: walk the chain to find it and count */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* recount the chain to cross-check driver stats */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		/* no driver stats: synthesize them from the walk above */
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	/*
	 * NOTE(review): when the caller passed non-NULL s, these writes
	 * update the local copy (_s) while the original s is what gets
	 * passed below — the driver-provided counts are used verbatim in
	 * that case; confirm this asymmetry is intentional.
	 */
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3891 
3892 #if SKYWALK
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	/*
	 * Atomically install fn as the DLIL input handler, but only if
	 * the current handler is still the default dlil_input_handler;
	 * returns EBUSY if some other handler was already installed.
	 */
	return atomic_test_set_ptr(&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3900 
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	/*
	 * Restore the default dlil_input_handler; loop on the
	 * compare-and-swap until it succeeds against whatever handler
	 * is currently installed (it may change between attempts).
	 */
	while (!atomic_test_set_ptr(&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler))) {
		;
	}
}
3910 
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	/*
	 * Atomically install fn as the DLIL output handler, but only if
	 * the current handler is still the default dlil_output_handler;
	 * returns EBUSY otherwise.
	 */
	return atomic_test_set_ptr(&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3918 
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	/*
	 * Restore the default dlil_output_handler; retry the
	 * compare-and-swap until it succeeds against the currently
	 * installed handler.
	 */
	while (!atomic_test_set_ptr(&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler))) {
		;
	}
}
3928 #endif /* SKYWALK */
3929 
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	/* Default DLIL output path: hand the packet to the driver. */
	return ifp->if_output(ifp, m);
}
3935 
/*
 * Default DLIL input handler: route the packet chain to the interface's
 * input thread strategy (async or sync), falling back to the main input
 * thread when the interface has no dedicated one.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	/* threads marked for synchronous RX bypass the input thread */
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
3956 
/*
 * Asynchronous input strategy: enqueue the packet chain on the input
 * thread's receive queue, update stats, and wake the thread to process
 * the packets later.  Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* drop the spin lock across the affinity downcall */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack.  Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			_addq_multi(&inpm->lo_rcvq_pkts, &head, &tail,
			    m_cnt, m_size);
		} else {
			_addq_multi(&inp->dlth_pkts, &head, &tail,
			    m_cnt, m_size);
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/* optionally re-walk the chain to verify the caller's counts */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/* notify after dropping the lock to avoid holding it in upcalls */
	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4069 
/*
 * Synchronous input strategy: enqueue the chain, then immediately drain
 * the input thread's queue and process all pending packets in the
 * caller's context instead of waking the input thread.  Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

#if IFNET_INPUT_SANITY_CHK
	/* optionally re-walk the chain to verify the caller's counts */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);

	/* drain everything queued so far, including prior packets */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4154 
4155 #if SKYWALK
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	/*
	 * Atomically install fn as the driver output routine, but only
	 * if if_output still holds the saved original (if_save_output);
	 * returns EBUSY if another handler is already in place.
	 */
	return atomic_test_set_ptr(&ifp->if_output,
	           ptrauth_nop_cast(void *, ifp->if_save_output),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4163 
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	/*
	 * Restore the saved original output routine; retry the
	 * compare-and-swap until it succeeds against the currently
	 * installed handler.
	 */
	while (!atomic_test_set_ptr(&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output))) {
		;
	}
}
4173 
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	/*
	 * Atomically install fn as the driver start routine, but only
	 * if if_start still holds the saved original (if_save_start);
	 * returns EBUSY if another handler is already in place.
	 */
	return atomic_test_set_ptr(&ifp->if_start,
	           ptrauth_nop_cast(void *, ifp->if_save_start),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4181 
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	/*
	 * Restore the saved original start routine; retry the
	 * compare-and-swap until it succeeds against the currently
	 * installed handler.
	 */
	while (!atomic_test_set_ptr(&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start))) {
		;
	}
}
4191 #endif /* SKYWALK */
4192 
/*
 * Record a transmit request and wake the starter thread if appropriate.
 * resetfc clears the flow-controlled state first (used when resuming
 * output); otherwise a flow-controlled interface is left alone.
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/*
	 * Wake the starter unless start is being deliberately delayed
	 * (ENQUEUE_MULTI batching below the delay queue-length threshold).
	 */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4222 
void
ifnet_start(struct ifnet *ifp)
{
	/* Kick the starter thread; leave flow-control state untouched. */
	ifnet_start_common(ifp, FALSE);
}
4228 
/*
 * Entry point of the per-interface starter thread.  Names the thread,
 * optionally binds it to the main input thread's affinity set (lo0
 * only), then parks in the embryonic state until the first wakeup,
 * continuing in ifnet_start_thread_cont().  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag.  This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4294 
/*
 * Continuation for the starter thread: services transmit requests by
 * invoking the driver's start routine until no new requests arrive (or
 * the interface is disabled/terminating), then blocks again with this
 * function as the continuation.  Terminates itself when the interface
 * is being detached.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/* first wakeup after creation: leave embryonic state only */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/* batch small sends: postpone start until more enqueues */
		if (!IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
		    &ifp->if_start_cycle : NULL);

		/* delayed-start batching: wake again after the delay */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4442 
4443 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4444 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4445 {
4446 	if (ts == NULL) {
4447 		bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4448 	} else {
4449 		*(&ifp->if_start_cycle) = *ts;
4450 	}
4451 
4452 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4453 		DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4454 		    if_name(ifp), ts->tv_nsec);
4455 	}
4456 }
4457 
4458 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4459 ifnet_poll_wakeup(struct ifnet *ifp)
4460 {
4461 	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4462 
4463 	ifp->if_poll_req++;
4464 	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4465 	    ifp->if_poll_thread != THREAD_NULL) {
4466 		wakeup_one((caddr_t)&ifp->if_poll_thread);
4467 	}
4468 }
4469 
/*
 * Request a poll cycle: take if_poll_lock and signal the poller
 * thread (via ifnet_poll_wakeup) to do work if it is inactive.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4480 
/*
 * Entry point of the ifnet poller thread (one per RXPOLL-capable
 * interface).  Names the thread, marks itself embryonic, issues a
 * self-wakeup so the continuation runs once to leave the embryonic
 * state, then blocks with ifnet_poll_thread_cont as continuation.
 * Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	/*
	 * Ordering matters here: assert_wait() is armed before the
	 * wakeup below so the wakeup cannot be lost, and the lock is
	 * dropped before blocking.
	 */
	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4509 
/*
 * Continuation routine of the ifnet poller thread.  Services poll
 * requests by repeatedly invoking the driver's input poll callback
 * and injecting the harvested packets into the input path, then
 * re-blocks (with itself as continuation) until the next request,
 * or terminates when the interface is detaching.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	/* interrupted wait or pending termination: shut this thread down */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/*
	 * First wakeup after thread creation: clear the embryonic flag
	 * and drop the pending-thread count (the lock must be released
	 * around that call), then go straight to the re-arm path below
	 * without running the poll loop.
	 */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		/* snapshot of the request counter; compared after each pass */
		u_int16_t req = ifp->if_poll_req;

		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll still notifies the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		/* re-arm the wait and block with this routine as continuation */
		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4676 
4677 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4678 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4679 {
4680 	if (ts == NULL) {
4681 		bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4682 	} else {
4683 		*(&ifp->if_poll_cycle) = *ts;
4684 	}
4685 
4686 	if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4687 		DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4688 		    if_name(ifp), ts->tv_nsec);
4689 	}
4690 }
4691 
4692 void
ifnet_purge(struct ifnet * ifp)4693 ifnet_purge(struct ifnet *ifp)
4694 {
4695 	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4696 		if_qflush_snd(ifp, false);
4697 	}
4698 }
4699 
4700 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)4701 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
4702 {
4703 	IFCQ_LOCK_ASSERT_HELD(ifq);
4704 
4705 	if (!(IFCQ_IS_READY(ifq))) {
4706 		return;
4707 	}
4708 
4709 	if (IFCQ_TBR_IS_ENABLED(ifq)) {
4710 		struct tb_profile tb = {
4711 			.rate = ifq->ifcq_tbr.tbr_rate_raw,
4712 			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
4713 		};
4714 		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
4715 	}
4716 
4717 	ifclassq_update(ifq, ev);
4718 }
4719 
4720 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4721 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4722 {
4723 	switch (ev) {
4724 	case CLASSQ_EV_LINK_BANDWIDTH:
4725 		if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4726 			ifp->if_poll_update++;
4727 		}
4728 		break;
4729 
4730 	default:
4731 		break;
4732 	}
4733 }
4734 
4735 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4736 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4737 {
4738 	struct ifclassq *ifq;
4739 	u_int32_t omodel;
4740 	errno_t err;
4741 
4742 	if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4743 		return EINVAL;
4744 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4745 		return ENXIO;
4746 	}
4747 
4748 	ifq = ifp->if_snd;
4749 	IFCQ_LOCK(ifq);
4750 	omodel = ifp->if_output_sched_model;
4751 	ifp->if_output_sched_model = model;
4752 	if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4753 		ifp->if_output_sched_model = omodel;
4754 	}
4755 	IFCQ_UNLOCK(ifq);
4756 
4757 	return err;
4758 }
4759 
4760 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4761 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4762 {
4763 	if (ifp == NULL) {
4764 		return EINVAL;
4765 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4766 		return ENXIO;
4767 	}
4768 
4769 	ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4770 
4771 	return 0;
4772 }
4773 
4774 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4775 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4776 {
4777 	if (ifp == NULL || maxqlen == NULL) {
4778 		return EINVAL;
4779 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4780 		return ENXIO;
4781 	}
4782 
4783 	*maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4784 
4785 	return 0;
4786 }
4787 
4788 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4789 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4790 {
4791 	errno_t err;
4792 
4793 	if (ifp == NULL || pkts == NULL) {
4794 		err = EINVAL;
4795 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4796 		err = ENXIO;
4797 	} else {
4798 		err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4799 		    IF_CLASSQ_ALL_GRPS, pkts, NULL);
4800 	}
4801 
4802 	return err;
4803 }
4804 
4805 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4806 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4807     u_int32_t *pkts, u_int32_t *bytes)
4808 {
4809 	errno_t err;
4810 
4811 	if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4812 	    (pkts == NULL && bytes == NULL)) {
4813 		err = EINVAL;
4814 	} else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4815 		err = ENXIO;
4816 	} else {
4817 		err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4818 		    pkts, bytes);
4819 	}
4820 
4821 	return err;
4822 }
4823 
4824 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4825 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4826 {
4827 	struct dlil_threading_info *inp;
4828 
4829 	if (ifp == NULL) {
4830 		return EINVAL;
4831 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4832 		return ENXIO;
4833 	}
4834 
4835 	if (maxqlen == 0) {
4836 		maxqlen = if_rcvq_maxlen;
4837 	} else if (maxqlen < IF_RCVQ_MINLEN) {
4838 		maxqlen = IF_RCVQ_MINLEN;
4839 	}
4840 
4841 	inp = ifp->if_inp;
4842 	lck_mtx_lock(&inp->dlth_lock);
4843 	qlimit(&inp->dlth_pkts) = maxqlen;
4844 	lck_mtx_unlock(&inp->dlth_lock);
4845 
4846 	return 0;
4847 }
4848 
4849 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4850 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4851 {
4852 	struct dlil_threading_info *inp;
4853 
4854 	if (ifp == NULL || maxqlen == NULL) {
4855 		return EINVAL;
4856 	} else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4857 		return ENXIO;
4858 	}
4859 
4860 	inp = ifp->if_inp;
4861 	lck_mtx_lock(&inp->dlth_lock);
4862 	*maxqlen = qlimit(&inp->dlth_pkts);
4863 	lck_mtx_unlock(&inp->dlth_lock);
4864 	return 0;
4865 }
4866 
4867 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4868 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4869     uint16_t delay_timeout)
4870 {
4871 	if (delay_qlen > 0 && delay_timeout > 0) {
4872 		if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4873 		ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4874 		ifp->if_start_delay_timeout = min(20000, delay_timeout);
4875 		/* convert timeout to nanoseconds */
4876 		ifp->if_start_delay_timeout *= 1000;
4877 		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4878 		    ifp->if_xname, (uint32_t)delay_qlen,
4879 		    (uint32_t)delay_timeout);
4880 	} else {
4881 		if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4882 	}
4883 }
4884 
/*
 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
 * buf holds the full header.
 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* bounce buffer used when buf is not aligned for direct header access */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* nothing to do when only ECN bits (or nothing) are set */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * Incrementally patch the IPv4 header checksum instead of
		 * recomputing it (cf. RFC 1624); htons() positions the
		 * one-byte TOS delta within its 16-bit checksum word, and
		 * the fold below carries the end-around carry back in.
		 * The new TOS is a subset of old_tos's bits, so the
		 * subtraction cannot go negative.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		/* write the patched header back if we used the bounce buffer */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* nothing to do when DSCP bits are already clear */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* IPv6 has no header checksum; just clear the bits */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		/* write the patched header back if we used the bounce buffer */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
4945 
/*
 * Enqueue a single packet (mbuf or Skywalk packet) on the given
 * classq (or ifp's default send queue when ifcq is NULL), stamping
 * its timestamp, updating foreground/realtime activity timestamps,
 * applying the Wi-Fi multicast DSCP workaround, and running the
 * delayed-start heuristics before kicking the driver's start
 * routine.  The packet object is consumed in all cases; *pdrop is
 * set by the enqueue path to report whether it was dropped.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here.  Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			/*
			 * NOTE: len is not refreshed after the pullup above;
			 * the hlen comparisons below therefore use the
			 * pre-pullup length, which at worst forces a second
			 * (harmless) pullup.
			 */
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP: leave the switch, no DSCP clearing */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface.  If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* apply the Wi-Fi multicast DSCP workaround, if armed above */
	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still within the delay window: just count */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and disable delay */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: decide based on the count */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open a fresh delay window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5256 
5257 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5258 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5259     classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5260     boolean_t flush, boolean_t *pdrop)
5261 {
5262 	int error;
5263 
5264 	/* enqueue the packet (caller consumes object) */
5265 	error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5266 	    cnt, bytes, pdrop);
5267 
5268 	/*
5269 	 * Tell the driver to start dequeueing; do this even when the queue
5270 	 * for the packet is suspended (EQSUSPENDED), as the driver could still
5271 	 * be dequeueing from other unsuspended queues.
5272 	 */
5273 	if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5274 		ifnet_start(ifp);
5275 	}
5276 	return error;
5277 }
5278 
#if DEVELOPMENT || DEBUG
/*
 * Debug-only helper: when kdebug tracing is enabled and the packet
 * matches the configured trace flow key (UDP over IPv4/IPv6), emit
 * the first 32 bytes of UDP payload via KDBG.
 */
void
trace_pkt_dump_payload(struct ifnet *ifp, struct __kern_packet *kpkt, bool input)
{
#define MIN_TRACE_DUMP_PKT_SIZE  32
	struct ether_header *eh = NULL;
	struct udphdr *uh = NULL;

	/* fast exit unless tracing is on and a flow key is configured */
	if (__probable(kdebug_enable == 0 || (flow_key_trace.fk_ipver != IPVERSION &&
	    flow_key_trace.fk_ipver != IPV6_VERSION))) {
		return;
	}

	uint16_t bdlim, bdlen, bdoff;
	uint8_t *baddr;

	MD_BUFLET_ADDR_ABS_DLEN(kpkt, baddr, bdlen, bdlim, bdoff);

	/* classify the flow first if the flowswitch hasn't already done so */
	if (!(kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED)) {
		if (!IFNET_IS_ETHERNET(ifp)) {
			return;
		}

		sa_family_t af = AF_UNSPEC;
		ASSERT(kpkt->pkt_l2_len > 0);

		baddr += kpkt->pkt_headroom;
		eh = (struct ether_header *)(void *)baddr;
		if (__improbable(sizeof(*eh) > kpkt->pkt_length)) {
			return;
		}
		if (__improbable(kpkt->pkt_headroom + sizeof(*eh) > bdlim)) {
			return;
		}
		uint16_t ether_type = ntohs(eh->ether_type);
		if (ether_type == ETHERTYPE_IP) {
			af = AF_INET;
		} else if (ether_type == ETHERTYPE_IPV6) {
			af = AF_INET6;
		} else {
			return;
		}
		flow_pkt_classify(kpkt, ifp, af, input);
	}

	if (kpkt->pkt_flow_ip_ver != flow_key_trace.fk_ipver) {
		return;
	}

	if (kpkt->pkt_flow_ip_proto != IPPROTO_UDP) {
		return;
	}

	/* on input, the key's source/destination are mirrored */
	uint16_t sport = input ? flow_key_trace.fk_dport : flow_key_trace.fk_sport;
	uint16_t dport = input ? flow_key_trace.fk_sport : flow_key_trace.fk_dport;

	if (kpkt->pkt_flow_udp_src != sport ||
	    kpkt->pkt_flow_udp_dst != dport) {
		return;
	}

	if (kpkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *ip_header = (struct ip *)kpkt->pkt_flow_ip_hdr;
		struct in_addr *saddr = input ? &flow_key_trace.fk_dst4 : &flow_key_trace.fk_src4;
		struct in_addr *daddr = input ? &flow_key_trace.fk_src4 : &flow_key_trace.fk_dst4;

		if (ip_header->ip_src.s_addr != saddr->s_addr ||
		    ip_header->ip_dst.s_addr != daddr->s_addr) {
			return;
		}
	} else if (kpkt->pkt_flow_ip_ver == IPV6_VERSION) {
		struct ip6_hdr *ip6_header = (struct ip6_hdr *)kpkt->pkt_flow_ip_hdr;
		struct in6_addr *saddr = input ? &flow_key_trace.fk_dst6 : &flow_key_trace.fk_src6;
		struct in6_addr *daddr = input ? &flow_key_trace.fk_src6 : &flow_key_trace.fk_dst6;

		if (!IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_src, saddr) ||
		    !IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_dst, daddr)) {
			return;
		}
	}

	int udp_payload_offset = kpkt->pkt_l2_len + kpkt->pkt_flow_ip_hlen + sizeof(struct udphdr);

	uint16_t pkt_payload_len = bdlim - bdoff;
	pkt_payload_len = (uint16_t)MIN(pkt_payload_len, kpkt->pkt_length);

	/*
	 * Guard against unsigned wraparound: if the L2/L3/L4 headers
	 * consume the whole (first) buflet, the subtraction below would
	 * underflow to a huge uint16_t value, pass the minimum-size
	 * check, and read past the packet data.
	 */
	if (pkt_payload_len < udp_payload_offset) {
		return;
	}
	pkt_payload_len -= udp_payload_offset;

	if (pkt_payload_len >= MIN_TRACE_DUMP_PKT_SIZE) {
		uh = (struct udphdr *)kpkt->pkt_flow_udp_hdr;
		uint8_t *payload = (uint8_t *)(uh + 1);

		/* Trace 32 bytes of UDP transport payload */
		uint64_t *trace1 = __DECONST(uint64_t *, payload);
		uint64_t *trace2 = trace1 + 1;
		uint64_t *trace3 = trace2 + 1;
		uint64_t *trace4 = trace3 + 1;

		if (input) {
			KDBG(IFNET_KTRACE_RX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
		} else {
			KDBG(IFNET_KTRACE_TX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
		}
	}
}
#endif /* DEVELOPMENT || DEBUG */
5384 
5385 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5386 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5387 {
5388 	struct ifnet *ifp = handle;
5389 	boolean_t pdrop;        /* dummy */
5390 	uint32_t i;
5391 
5392 	ASSERT(n_pkts >= 1);
5393 	for (i = 0; i < n_pkts - 1; i++) {
5394 		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5395 		    FALSE, &pdrop);
5396 	}
5397 	/* flush with the last packet */
5398 	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5399 	    TRUE, &pdrop);
5400 
5401 	return 0;
5402 }
5403 
5404 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5405 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5406     classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5407 {
5408 #if DEVELOPMENT || DEBUG
5409 	switch (pkt->cp_ptype) {
5410 	case QP_PACKET: {
5411 		trace_pkt_dump_payload(ifp, pkt->cp_kpkt, false);
5412 		break;
5413 	}
5414 	case QP_MBUF:
5415 	case QP_INVALID: {
5416 		break;
5417 	}
5418 	}
5419 #endif /* DEVELOPMENT || DEBUG */
5420 
5421 	if (ifp->if_output_netem != NULL) {
5422 		bool drop;
5423 		errno_t error;
5424 		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5425 		*pdrop = drop ? TRUE : FALSE;
5426 		return error;
5427 	} else {
5428 		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5429 	}
5430 }
5431 
5432 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5433 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5434 {
5435 	boolean_t pdrop;
5436 	return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5437 }
5438 
5439 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5440 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5441     boolean_t *pdrop)
5442 {
5443 	classq_pkt_t pkt;
5444 
5445 	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5446 	    m->m_nextpkt != NULL) {
5447 		if (m != NULL) {
5448 			m_freem_list(m);
5449 			*pdrop = TRUE;
5450 		}
5451 		return EINVAL;
5452 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5453 	    !IF_FULLY_ATTACHED(ifp)) {
5454 		/* flag tested without lock for performance */
5455 		m_freem(m);
5456 		*pdrop = TRUE;
5457 		return ENXIO;
5458 	} else if (!(ifp->if_flags & IFF_UP)) {
5459 		m_freem(m);
5460 		*pdrop = TRUE;
5461 		return ENETDOWN;
5462 	}
5463 
5464 	CLASSQ_PKT_INIT_MBUF(&pkt, m);
5465 	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5466 }
5467 
5468 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5469 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5470     struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5471     boolean_t *pdrop)
5472 {
5473 	classq_pkt_t head, tail;
5474 
5475 	ASSERT(m_head != NULL);
5476 	ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5477 	ASSERT(m_tail != NULL);
5478 	ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5479 	ASSERT(ifp != NULL);
5480 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5481 
5482 	if (!IF_FULLY_ATTACHED(ifp)) {
5483 		/* flag tested without lock for performance */
5484 		m_freem_list(m_head);
5485 		*pdrop = TRUE;
5486 		return ENXIO;
5487 	} else if (!(ifp->if_flags & IFF_UP)) {
5488 		m_freem_list(m_head);
5489 		*pdrop = TRUE;
5490 		return ENETDOWN;
5491 	}
5492 
5493 	CLASSQ_PKT_INIT_MBUF(&head, m_head);
5494 	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5495 	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5496 	           flush, pdrop);
5497 }
5498 
5499 #if SKYWALK
5500 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5501 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5502     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5503 {
5504 	classq_pkt_t pkt;
5505 
5506 	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5507 
5508 	if (__improbable(ifp == NULL || kpkt == NULL)) {
5509 		if (kpkt != NULL) {
5510 			pp_free_packet(__DECONST(struct kern_pbufpool *,
5511 			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5512 			*pdrop = TRUE;
5513 		}
5514 		return EINVAL;
5515 	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5516 	    !IF_FULLY_ATTACHED(ifp))) {
5517 		/* flag tested without lock for performance */
5518 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5519 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5520 		*pdrop = TRUE;
5521 		return ENXIO;
5522 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5523 		pp_free_packet(__DECONST(struct kern_pbufpool *,
5524 		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5525 		*pdrop = TRUE;
5526 		return ENETDOWN;
5527 	}
5528 
5529 	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5530 	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5531 }
5532 
5533 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5534 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5535     boolean_t flush, boolean_t *pdrop)
5536 {
5537 	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5538 }
5539 
5540 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5541 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5542     struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5543 {
5544 	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5545 }
5546 
5547 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5548 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5549     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5550     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5551 {
5552 	classq_pkt_t head, tail;
5553 
5554 	ASSERT(k_head != NULL);
5555 	ASSERT(k_tail != NULL);
5556 	ASSERT(ifp != NULL);
5557 	ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5558 
5559 	if (!IF_FULLY_ATTACHED(ifp)) {
5560 		/* flag tested without lock for performance */
5561 		pp_free_packet_chain(k_head, NULL);
5562 		*pdrop = TRUE;
5563 		return ENXIO;
5564 	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5565 		pp_free_packet_chain(k_head, NULL);
5566 		*pdrop = TRUE;
5567 		return ENETDOWN;
5568 	}
5569 
5570 	CLASSQ_PKT_INIT_PACKET(&head, k_head);
5571 	CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5572 	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5573 	           flush, pdrop);
5574 }
5575 
5576 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5577 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5578     struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5579     boolean_t *pdrop)
5580 {
5581 	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5582 	           cnt, bytes, flush, pdrop);
5583 }
5584 
5585 errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5586 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5587     struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5588     uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5589 {
5590 	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5591 	           cnt, bytes, flush, pdrop);
5592 }
5593 #endif /* SKYWALK */
5594 
5595 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5596 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5597 {
5598 	errno_t rc;
5599 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5600 
5601 	if (ifp == NULL || mp == NULL) {
5602 		return EINVAL;
5603 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5604 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5605 		return ENXIO;
5606 	}
5607 	if (!ifnet_is_attached(ifp, 1)) {
5608 		return ENXIO;
5609 	}
5610 
5611 #if SKYWALK
5612 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5613 #endif /* SKYWALK */
5614 	rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5615 	    &pkt, NULL, NULL, NULL, 0);
5616 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5617 	ifnet_decr_iorefcnt(ifp);
5618 	*mp = pkt.cp_mbuf;
5619 	return rc;
5620 }
5621 
5622 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5623 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5624     struct mbuf **mp)
5625 {
5626 	errno_t rc;
5627 	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5628 
5629 	if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5630 		return EINVAL;
5631 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5632 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5633 		return ENXIO;
5634 	}
5635 	if (!ifnet_is_attached(ifp, 1)) {
5636 		return ENXIO;
5637 	}
5638 
5639 #if SKYWALK
5640 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5641 #endif /* SKYWALK */
5642 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5643 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5644 	VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5645 	ifnet_decr_iorefcnt(ifp);
5646 	*mp = pkt.cp_mbuf;
5647 	return rc;
5648 }
5649 
5650 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5651 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5652     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5653 {
5654 	errno_t rc;
5655 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5656 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5657 
5658 	if (ifp == NULL || head == NULL || pkt_limit < 1) {
5659 		return EINVAL;
5660 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5661 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5662 		return ENXIO;
5663 	}
5664 	if (!ifnet_is_attached(ifp, 1)) {
5665 		return ENXIO;
5666 	}
5667 
5668 #if SKYWALK
5669 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5670 #endif /* SKYWALK */
5671 	rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5672 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5673 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5674 	ifnet_decr_iorefcnt(ifp);
5675 	*head = pkt_head.cp_mbuf;
5676 	if (tail != NULL) {
5677 		*tail = pkt_tail.cp_mbuf;
5678 	}
5679 	return rc;
5680 }
5681 
5682 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5683 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5684     struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5685 {
5686 	errno_t rc;
5687 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5688 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5689 
5690 	if (ifp == NULL || head == NULL || byte_limit < 1) {
5691 		return EINVAL;
5692 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5693 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5694 		return ENXIO;
5695 	}
5696 	if (!ifnet_is_attached(ifp, 1)) {
5697 		return ENXIO;
5698 	}
5699 
5700 #if SKYWALK
5701 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5702 #endif /* SKYWALK */
5703 	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5704 	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5705 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5706 	ifnet_decr_iorefcnt(ifp);
5707 	*head = pkt_head.cp_mbuf;
5708 	if (tail != NULL) {
5709 		*tail = pkt_tail.cp_mbuf;
5710 	}
5711 	return rc;
5712 }
5713 
5714 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5715 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5716     u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5717     u_int32_t *len)
5718 {
5719 	errno_t rc;
5720 	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5721 	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5722 
5723 	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5724 	    !MBUF_VALID_SC(sc)) {
5725 		return EINVAL;
5726 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5727 	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5728 		return ENXIO;
5729 	}
5730 	if (!ifnet_is_attached(ifp, 1)) {
5731 		return ENXIO;
5732 	}
5733 
5734 #if SKYWALK
5735 	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5736 #endif /* SKYWALK */
5737 	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
5738 	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
5739 	    cnt, len, 0);
5740 	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5741 	ifnet_decr_iorefcnt(ifp);
5742 	*head = pkt_head.cp_mbuf;
5743 	if (tail != NULL) {
5744 		*tail = pkt_tail.cp_mbuf;
5745 	}
5746 	return rc;
5747 }
5748 
5749 #if XNU_TARGET_OS_OSX
5750 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)5751 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
5752     const struct sockaddr *dest, const char *dest_linkaddr,
5753     const char *frame_type, u_int32_t *pre, u_int32_t *post)
5754 {
5755 	if (pre != NULL) {
5756 		*pre = 0;
5757 	}
5758 	if (post != NULL) {
5759 		*post = 0;
5760 	}
5761 
5762 	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
5763 }
5764 #endif /* XNU_TARGET_OS_OSX */
5765 
5766 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5767 packet_has_vlan_tag(struct mbuf * m)
5768 {
5769 	u_int   tag = 0;
5770 
5771 	if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5772 		tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5773 		if (tag == 0) {
5774 			/* the packet is just priority-tagged, clear the bit */
5775 			m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5776 		}
5777 	}
5778 	return tag != 0;
5779 }
5780 
/*
 * Run an inbound packet through the interface filter chain of ifp.
 * Returns 0 when the packet survives every filter; a non-zero result
 * from a filter stops iteration and is propagated to the caller.
 * The filter may replace or consume the packet via m_p/frame_header_p.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet = FALSE;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	/* may clear CSUM_VLAN_TAG_VALID for a priority-only tag */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the lock across the callback; the busy
			 * marker taken above keeps the list stable while
			 * the lock is not held.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5841 
/*
 * Run an outbound packet through the interface filter chain of ifp.
 * Returns 0 when the packet survives every filter; a non-zero result
 * from a filter stops iteration and is propagated to the caller.
 * The filter may replace or consume the packet via m_p.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t               is_vlan_packet;
	struct ifnet_filter     *filter;
	struct mbuf             *m = *m_p;

	/* may clear CSUM_VLAN_TAG_VALID for a priority-only tag */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the lock across the callback; the busy
			 * marker taken above keeps the list stable while
			 * the lock is not held.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5891 
5892 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)5893 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
5894 {
5895 	int error;
5896 
5897 	if (ifproto->proto_kpi == kProtoKPI_v1) {
5898 		/* Version 1 protocols get one packet at a time */
5899 		while (m != NULL) {
5900 			char *  frame_header;
5901 			mbuf_t  next_packet;
5902 
5903 			next_packet = m->m_nextpkt;
5904 			m->m_nextpkt = NULL;
5905 			frame_header = m->m_pkthdr.pkt_hdr;
5906 			m->m_pkthdr.pkt_hdr = NULL;
5907 			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
5908 			    ifproto->protocol_family, m, frame_header);
5909 			if (error != 0 && error != EJUSTRETURN) {
5910 				m_freem(m);
5911 			}
5912 			m = next_packet;
5913 		}
5914 	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
5915 		/* Version 2 protocols support packet lists */
5916 		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
5917 		    ifproto->protocol_family, m);
5918 		if (error != 0 && error != EJUSTRETURN) {
5919 			m_freem_list(m);
5920 		}
5921 	}
5922 }
5923 
5924 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5925 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5926     struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5927 {
5928 	struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5929 
5930 	if (s->packets_in != 0) {
5931 		d->packets_in += s->packets_in;
5932 	}
5933 	if (s->bytes_in != 0) {
5934 		d->bytes_in += s->bytes_in;
5935 	}
5936 	if (s->errors_in != 0) {
5937 		d->errors_in += s->errors_in;
5938 	}
5939 
5940 	if (s->packets_out != 0) {
5941 		d->packets_out += s->packets_out;
5942 	}
5943 	if (s->bytes_out != 0) {
5944 		d->bytes_out += s->bytes_out;
5945 	}
5946 	if (s->errors_out != 0) {
5947 		d->errors_out += s->errors_out;
5948 	}
5949 
5950 	if (s->collisions != 0) {
5951 		d->collisions += s->collisions;
5952 	}
5953 	if (s->dropped != 0) {
5954 		d->dropped += s->dropped;
5955 	}
5956 
5957 	if (poll) {
5958 		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5959 	}
5960 }
5961 
5962 static boolean_t
dlil_input_stats_sync(struct ifnet * ifp,struct dlil_threading_info * inp)5963 dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
5964 {
5965 	struct ifnet_stat_increment_param *s = &inp->dlth_stats;
5966 
5967 	/*
5968 	 * Use of atomic operations is unavoidable here because
5969 	 * these stats may also be incremented elsewhere via KPIs.
5970 	 */
5971 	if (s->packets_in != 0) {
5972 		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
5973 		s->packets_in = 0;
5974 	}
5975 	if (s->bytes_in != 0) {
5976 		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
5977 		s->bytes_in = 0;
5978 	}
5979 	if (s->errors_in != 0) {
5980 		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
5981 		s->errors_in = 0;
5982 	}
5983 
5984 	if (s->packets_out != 0) {
5985 		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
5986 		s->packets_out = 0;
5987 	}
5988 	if (s->bytes_out != 0) {
5989 		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
5990 		s->bytes_out = 0;
5991 	}
5992 	if (s->errors_out != 0) {
5993 		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
5994 		s->errors_out = 0;
5995 	}
5996 
5997 	if (s->collisions != 0) {
5998 		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
5999 		s->collisions = 0;
6000 	}
6001 	if (s->dropped != 0) {
6002 		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
6003 		s->dropped = 0;
6004 	}
6005 
6006 	/*
6007 	 * No need for atomic operations as they are modified here
6008 	 * only from within the DLIL input thread context.
6009 	 */
6010 	if (ifp->if_poll_tstats.packets != 0) {
6011 		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
6012 		ifp->if_poll_tstats.packets = 0;
6013 	}
6014 	if (ifp->if_poll_tstats.bytes != 0) {
6015 		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
6016 		ifp->if_poll_tstats.bytes = 0;
6017 	}
6018 
6019 	return ifp->if_data_threshold != 0;
6020 }
6021 
6022 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6023 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6024 {
6025 	return dlil_input_packet_list_common(ifp, m, 0,
6026 	           IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6027 }
6028 
6029 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6030 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6031     u_int32_t cnt, ifnet_model_t mode)
6032 {
6033 	return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6034 }
6035 
6036 static void
dlil_input_packet_list_common(struct ifnet * ifp_param,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode,boolean_t ext)6037 dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
6038     u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
6039 {
6040 	int error = 0;
6041 	protocol_family_t protocol_family;
6042 	mbuf_t next_packet;
6043 	ifnet_t ifp = ifp_param;
6044 	char *frame_header = NULL;
6045 	struct if_proto *last_ifproto = NULL;
6046 	mbuf_t pkt_first = NULL;
6047 	mbuf_t *pkt_next = NULL;
6048 	u_int32_t poll_thresh = 0, poll_ival = 0;
6049 	int iorefcnt = 0;
6050 
6051 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6052 
6053 	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
6054 	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
6055 		poll_thresh = cnt;
6056 	}
6057 
6058 	while (m != NULL) {
6059 		struct if_proto *ifproto = NULL;
6060 		uint32_t pktf_mask;     /* pkt flags to preserve */
6061 
6062 		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);
6063 
6064 		if (ifp_param == NULL) {
6065 			ifp = m->m_pkthdr.rcvif;
6066 		}
6067 
6068 		if ((ifp->if_eflags & IFEF_RXPOLL) &&
6069 		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
6070 		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
6071 			ifnet_poll(ifp);
6072 		}
6073 
6074 		/* Check if this mbuf looks valid */
6075 		MBUF_INPUT_CHECK(m, ifp);
6076 
6077 		next_packet = m->m_nextpkt;
6078 		m->m_nextpkt = NULL;
6079 		frame_header = m->m_pkthdr.pkt_hdr;
6080 		m->m_pkthdr.pkt_hdr = NULL;
6081 
6082 		/*
6083 		 * Get an IO reference count if the interface is not
6084 		 * loopback (lo0) and it is attached; lo0 never goes
6085 		 * away, so optimize for that.
6086 		 */
6087 		if (ifp != lo_ifp) {
6088 			/* iorefcnt is 0 if it hasn't been taken yet */
6089 			if (iorefcnt == 0) {
6090 				if (!ifnet_datamov_begin(ifp)) {
6091 					m_freem(m);
6092 					goto next;
6093 				}
6094 			}
6095 			iorefcnt = 1;
6096 			/*
6097 			 * Preserve the time stamp and skip pktap flags.
6098 			 */
6099 			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
6100 		} else {
6101 			/*
6102 			 * If this arrived on lo0, preserve interface addr
6103 			 * info to allow for connectivity between loopback
6104 			 * and local interface addresses.
6105 			 */
6106 			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
6107 		}
6108 		pktf_mask |= PKTF_WAKE_PKT;
6109 
6110 		/* make sure packet comes in clean */
6111 		m_classifier_init(m, pktf_mask);
6112 
6113 		ifp_inc_traffic_class_in(ifp, m);
6114 
6115 		/* find which protocol family this packet is for */
6116 		ifnet_lock_shared(ifp);
6117 		error = (*ifp->if_demux)(ifp, m, frame_header,
6118 		    &protocol_family);
6119 		ifnet_lock_done(ifp);
6120 		if (error != 0) {
6121 			if (error == EJUSTRETURN) {
6122 				goto next;
6123 			}
6124 			protocol_family = 0;
6125 		}
6126 
6127 #if (DEVELOPMENT || DEBUG)
6128 		/*
6129 		 * For testing we do not care about broadcast and multicast packets as
6130 		 * they are not as controllable as unicast traffic
6131 		 */
6132 		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
6133 			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
6134 			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
6135 				/*
6136 				 * This is a one-shot command
6137 				 */
6138 				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
6139 				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
6140 			}
6141 		}
6142 #endif /* (DEVELOPMENT || DEBUG) */
6143 		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
6144 			char buffer[64];
6145 			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));
6146 
6147 			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
6148 			    ifp->if_xname, m_pktlen(m));
6149 			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
6150 				log_hexdump(buffer, buflen);
6151 			}
6152 		}
6153 
6154 		pktap_input(ifp, protocol_family, m, frame_header);
6155 
6156 		/* Drop v4 packets received on CLAT46 enabled cell interface */
6157 		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6158 		    ifp->if_type == IFT_CELLULAR) {
6159 			m_freem(m);
6160 			ip6stat.ip6s_clat464_in_v4_drop++;
6161 			goto next;
6162 		}
6163 
6164 		/* Translate the packet if it is received on CLAT interface */
6165 		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
6166 		    && dlil_is_clat_needed(protocol_family, m)) {
6167 			char *data = NULL;
6168 			struct ether_header eh;
6169 			struct ether_header *ehp = NULL;
6170 
6171 			if (ifp->if_type == IFT_ETHER) {
6172 				ehp = (struct ether_header *)(void *)frame_header;
6173 				/* Skip RX Ethernet packets if they are not IPV6 */
6174 				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
6175 					goto skip_clat;
6176 				}
6177 
6178 				/* Keep a copy of frame_header for Ethernet packets */
6179 				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
6180 			}
6181 			error = dlil_clat64(ifp, &protocol_family, &m);
6182 			data = (char *) mbuf_data(m);
6183 			if (error != 0) {
6184 				m_freem(m);
6185 				ip6stat.ip6s_clat464_in_drop++;
6186 				goto next;
6187 			}
6188 			/* Native v6 should be No-op */
6189 			if (protocol_family != PF_INET) {
6190 				goto skip_clat;
6191 			}
6192 
6193 			/* Do this only for translated v4 packets. */
6194 			switch (ifp->if_type) {
6195 			case IFT_CELLULAR:
6196 				frame_header = data;
6197 				break;
6198 			case IFT_ETHER:
6199 				/*
6200 				 * Drop if the mbuf doesn't have enough
6201 				 * space for Ethernet header
6202 				 */
6203 				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
6204 					m_free(m);
6205 					ip6stat.ip6s_clat464_in_drop++;
6206 					goto next;
6207 				}
6208 				/*
6209 				 * Set the frame_header ETHER_HDR_LEN bytes
6210 				 * preceeding the data pointer. Change
6211 				 * the ether_type too.
6212 				 */
6213 				frame_header = data - ETHER_HDR_LEN;
6214 				eh.ether_type = htons(ETHERTYPE_IP);
6215 				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
6216 				break;
6217 			}
6218 		}
6219 skip_clat:
6220 		/*
6221 		 * Match the wake packet against the list of ports that has been
6222 		 * been queried by the driver before the device went to sleep
6223 		 */
6224 		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
6225 			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
6226 				if_ports_used_match_mbuf(ifp, protocol_family, m);
6227 			}
6228 		}
6229 		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
6230 		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
6231 			dlil_input_cksum_dbg(ifp, m, frame_header,
6232 			    protocol_family);
6233 		}
6234 		/*
6235 		 * For partial checksum offload, we expect the driver to
6236 		 * set the start offset indicating the start of the span
6237 		 * that is covered by the hardware-computed checksum;
6238 		 * adjust this start offset accordingly because the data
6239 		 * pointer has been advanced beyond the link-layer header.
6240 		 *
6241 		 * Virtual lan types (bridge, vlan, bond) can call
6242 		 * dlil_input_packet_list() with the same packet with the
6243 		 * checksum flags set. Set a flag indicating that the
6244 		 * adjustment has already been done.
6245 		 */
6246 		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
6247 			/* adjustment has already been done */
6248 		} else if ((m->m_pkthdr.csum_flags &
6249 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6250 		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6251 			int adj;
6252 			if (frame_header == NULL ||
6253 			    frame_header < (char *)mbuf_datastart(m) ||
6254 			    frame_header > (char *)m->m_data ||
6255 			    (adj = (int)(m->m_data - frame_header)) >
6256 			    m->m_pkthdr.csum_rx_start) {
6257 				m->m_pkthdr.csum_data = 0;
6258 				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
6259 				hwcksum_in_invalidated++;
6260 			} else {
6261 				m->m_pkthdr.csum_rx_start -= adj;
6262 			}
6263 			/* make sure we don't adjust more than once */
6264 			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
6265 		}
6266 		if (clat_debug) {
6267 			pktap_input(ifp, protocol_family, m, frame_header);
6268 		}
6269 
6270 		if (m->m_flags & (M_BCAST | M_MCAST)) {
6271 			atomic_add_64(&ifp->if_imcasts, 1);
6272 		}
6273 
6274 		/* run interface filters */
6275 		error = dlil_interface_filters_input(ifp, &m,
6276 		    &frame_header, protocol_family);
6277 		if (error != 0) {
6278 			if (error != EJUSTRETURN) {
6279 				m_freem(m);
6280 			}
6281 			goto next;
6282 		}
6283 		/*
6284 		 * A VLAN interface receives VLAN-tagged packets by attaching
6285 		 * its PF_VLAN protocol to a parent interface. When a VLAN
6286 		 * interface is a member of a bridge, the parent interface
6287 		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
6288 		 * M_PROMISC packet must be processed by the VLAN protocol
6289 		 * so that it can be sent up the stack via
6290 		 * dlil_input_packet_list(). That allows the bridge interface's
6291 		 * input filter, attached to the VLAN interface, to process
6292 		 * the packet.
6293 		 */
6294 		if (protocol_family != PF_VLAN &&
6295 		    (m->m_flags & M_PROMISC) != 0) {
6296 			m_freem(m);
6297 			goto next;
6298 		}
6299 
6300 		/* Lookup the protocol attachment to this interface */
6301 		if (protocol_family == 0) {
6302 			ifproto = NULL;
6303 		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
6304 		    (last_ifproto->protocol_family == protocol_family)) {
6305 			VERIFY(ifproto == NULL);
6306 			ifproto = last_ifproto;
6307 			if_proto_ref(last_ifproto);
6308 		} else {
6309 			VERIFY(ifproto == NULL);
6310 			ifnet_lock_shared(ifp);
6311 			/* callee holds a proto refcnt upon success */
6312 			ifproto = find_attached_proto(ifp, protocol_family);
6313 			ifnet_lock_done(ifp);
6314 		}
6315 		if (ifproto == NULL) {
6316 			/* no protocol for this packet, discard */
6317 			m_freem(m);
6318 			goto next;
6319 		}
6320 		if (ifproto != last_ifproto) {
6321 			if (last_ifproto != NULL) {
6322 				/* pass up the list for the previous protocol */
6323 				dlil_ifproto_input(last_ifproto, pkt_first);
6324 				pkt_first = NULL;
6325 				if_proto_free(last_ifproto);
6326 			}
6327 			last_ifproto = ifproto;
6328 			if_proto_ref(ifproto);
6329 		}
6330 		/* extend the list */
6331 		m->m_pkthdr.pkt_hdr = frame_header;
6332 		if (pkt_first == NULL) {
6333 			pkt_first = m;
6334 		} else {
6335 			*pkt_next = m;
6336 		}
6337 		pkt_next = &m->m_nextpkt;
6338 
6339 next:
6340 		if (next_packet == NULL && last_ifproto != NULL) {
6341 			/* pass up the last list of packets */
6342 			dlil_ifproto_input(last_ifproto, pkt_first);
6343 			if_proto_free(last_ifproto);
6344 			last_ifproto = NULL;
6345 		}
6346 		if (ifproto != NULL) {
6347 			if_proto_free(ifproto);
6348 			ifproto = NULL;
6349 		}
6350 
6351 		m = next_packet;
6352 
6353 		/* update the driver's multicast filter, if needed */
6354 		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6355 			ifp->if_updatemcasts = 0;
6356 		}
6357 		if (iorefcnt == 1) {
6358 			/* If the next mbuf is on a different interface, unlock data-mov */
6359 			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
6360 				ifnet_datamov_end(ifp);
6361 				iorefcnt = 0;
6362 			}
6363 		}
6364 	}
6365 
6366 	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6367 }
6368 
6369 errno_t
if_mcasts_update(struct ifnet * ifp)6370 if_mcasts_update(struct ifnet *ifp)
6371 {
6372 	errno_t err;
6373 
6374 	err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6375 	if (err == EAFNOSUPPORT) {
6376 		err = 0;
6377 	}
6378 	DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6379 	    "(err=%d)\n", if_name(ifp),
6380 	    (err == 0 ? "successfully restored" : "failed to restore"),
6381 	    ifp->if_updatemcasts, err);
6382 
6383 	/* just return success */
6384 	return 0;
6385 }
6386 
6387 /* If ifp is set, we will increment the generation for the interface */
6388 int
dlil_post_complete_msg(struct ifnet * ifp,struct kev_msg * event)6389 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6390 {
6391 	if (ifp != NULL) {
6392 		ifnet_increment_generation(ifp);
6393 	}
6394 
6395 #if NECP
6396 	necp_update_all_clients();
6397 #endif /* NECP */
6398 
6399 	return kev_post_msg(event);
6400 }
6401 
6402 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6403 dlil_post_sifflags_msg(struct ifnet * ifp)
6404 {
6405 	struct kev_msg ev_msg;
6406 	struct net_event_data ev_data;
6407 
6408 	bzero(&ev_data, sizeof(ev_data));
6409 	bzero(&ev_msg, sizeof(ev_msg));
6410 	ev_msg.vendor_code = KEV_VENDOR_APPLE;
6411 	ev_msg.kev_class = KEV_NETWORK_CLASS;
6412 	ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6413 	ev_msg.event_code = KEV_DL_SIFFLAGS;
6414 	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6415 	ev_data.if_family = ifp->if_family;
6416 	ev_data.if_unit = (u_int32_t) ifp->if_unit;
6417 	ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6418 	ev_msg.dv[0].data_ptr = &ev_data;
6419 	ev_msg.dv[1].data_length = 0;
6420 	dlil_post_complete_msg(ifp, &ev_msg);
6421 }
6422 
#define TMP_IF_PROTO_ARR_SIZE   10
/*
 * Deliver a kernel event to everything attached to this interface:
 * first the interface filters, then every attached protocol, then the
 * interface's own event callback; finally the event is posted via
 * dlil_post_complete_msg().  Returns the result of posting the event.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	/* small protocol lists use this stack array; larger ones are heap-allocated */
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock: the callback may block or re-enter DLIL */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				/*
				 * NOTE(review): cleanup is reached here with
				 * tmp_ifproto_arr == NULL; kfree_type below is
				 * presumably a no-op on NULL — confirm.
				 */
				goto cleanup;
			}
		}

		/* snapshot all attached protocols, taking a ref on each */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* deliver the event to each snapshotted protocol, lock-free */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6523 
6524 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6525 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6526 {
6527 	struct kev_msg kev_msg;
6528 	int result = 0;
6529 
6530 	if (ifp == NULL || event == NULL) {
6531 		return EINVAL;
6532 	}
6533 
6534 	bzero(&kev_msg, sizeof(kev_msg));
6535 	kev_msg.vendor_code = event->vendor_code;
6536 	kev_msg.kev_class = event->kev_class;
6537 	kev_msg.kev_subclass = event->kev_subclass;
6538 	kev_msg.event_code = event->event_code;
6539 	kev_msg.dv[0].data_ptr = &event->event_data[0];
6540 	kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6541 	kev_msg.dv[1].data_length = 0;
6542 
6543 	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6544 
6545 	return result;
6546 }
6547 
6548 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6549 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6550 {
6551 	mbuf_t  n = m;
6552 	int chainlen = 0;
6553 
6554 	while (n != NULL) {
6555 		chainlen++;
6556 		n = n->m_next;
6557 	}
6558 	switch (chainlen) {
6559 	case 0:
6560 		break;
6561 	case 1:
6562 		atomic_add_64(&cls->cls_one, 1);
6563 		break;
6564 	case 2:
6565 		atomic_add_64(&cls->cls_two, 1);
6566 		break;
6567 	case 3:
6568 		atomic_add_64(&cls->cls_three, 1);
6569 		break;
6570 	case 4:
6571 		atomic_add_64(&cls->cls_four, 1);
6572 		break;
6573 	case 5:
6574 	default:
6575 		atomic_add_64(&cls->cls_five_or_more, 1);
6576 		break;
6577 	}
6578 }
6579 
#if CONFIG_DTRACE
/*
 * Fire the DTrace ip:::send probe for an outbound IPv4 or IPv6 packet.
 * Kept out-of-line (noinline) so the probe call does not bloat the
 * dlil_output() fast path.  Packets of other protocol families are ignored.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t  m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		/* v4: pass the ip header, NULL for the ip6 slot */
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		/* v6: pass the ip6 header, NULL for the ip slot */
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
#endif /* CONFIG_DTRACE */
6598 
6599 /*
6600  * dlil_output
6601  *
6602  * Caller should have a lock on the protocol domain if the protocol
6603  * doesn't support finer grained locking. In most cases, the lock
6604  * will be held from the socket layer and won't be released until
6605  * we return back to the socket layer.
6606  *
6607  * This does mean that we must take a protocol lock before we take
6608  * an interface lock if we're going to take both. This makes sense
6609  * because a protocol is likely to interact with an ifp while it
6610  * is under the protocol lock.
6611  *
6612  * An advisory code will be returned if adv is not null. This
6613  * can be used to provide feedback about interface queues to the
6614  * application.
6615  */
errno_t
dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
    void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
{
	char *frame_type = NULL;
	char *dst_linkaddr = NULL;
	int retval = 0;
	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
	struct if_proto *proto = NULL;
	mbuf_t  m = NULL;
	/* send_head/send_tail batch packets for interfaces that accept lists */
	mbuf_t  send_head = NULL;
	mbuf_t  *send_tail = &send_head;
	/* 1 while we hold the datamov (I/O) reference taken below */
	int iorefcnt = 0;
	/* bytes prepended/appended by the framer for the current packet */
	u_int32_t pre = 0, post = 0;
	/* forwarded-packet byte/packet counters accumulated for if_ stats */
	u_int32_t fpkts = 0, fbytes = 0;
	int32_t flen = 0;
	struct timespec now;
	u_int64_t now_nsec;
	/* TRUE once CLAT46 (v4->v6) translation succeeded for this chain */
	boolean_t did_clat46 = FALSE;
	protocol_family_t old_proto_family = proto_family;
	struct sockaddr_in6 dest6;
	struct rtentry *rt = NULL;
	/* M_LOOP bit saved around the framer so it can be restored on the next packet */
	u_int32_t m_loop_set = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
	 * from happening while this operation is in progress
	 */
	if (!ifnet_datamov_begin(ifp)) {
		retval = ENXIO;
		goto cleanup;
	}
	iorefcnt = 1;

	VERIFY(ifp->if_output_dlil != NULL);

	/* update the driver's multicast filter, if needed */
	if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
		ifp->if_updatemcasts = 0;
	}

	frame_type = frame_type_buffer;
	dst_linkaddr = dst_linkaddr_buffer;

	if (raw == 0) {
		ifnet_lock_shared(ifp);
		/* callee holds a proto refcnt upon success */
		proto = find_attached_proto(ifp, proto_family);
		if (proto == NULL) {
			ifnet_lock_done(ifp);
			retval = ENXIO;
			goto cleanup;
		}
		ifnet_lock_done(ifp);
	}

preout_again:
	if (packetlist == NULL) {
		goto cleanup;
	}

	/* detach the head packet from the chain */
	m = packetlist;
	packetlist = packetlist->m_nextpkt;
	m->m_nextpkt = NULL;

	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);

	/*
	 * Perform address family translation for the first
	 * packet outside the loop in order to perform address
	 * lookup for the translated proto family.
	 */
	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
	    (ifp->if_type == IFT_CELLULAR ||
	    dlil_is_clat_needed(proto_family, m))) {
		retval = dlil_clat46(ifp, &proto_family, &m);
		/*
		 * Go to the next packet if translation fails
		 */
		if (retval != 0) {
			m_freem(m);
			m = NULL;
			ip6stat.ip6s_clat464_out_drop++;
			/* Make sure that the proto family is PF_INET */
			ASSERT(proto_family == PF_INET);
			goto preout_again;
		}
		/*
		 * Free the old one and make it point to the IPv6 proto structure.
		 *
		 * Change proto for the first time we have successfully
		 * performed address family translation.
		 */
		if (!did_clat46 && proto_family == PF_INET6) {
			did_clat46 = TRUE;

			if (proto != NULL) {
				if_proto_free(proto);
			}
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			proto = find_attached_proto(ifp, proto_family);
			if (proto == NULL) {
				ifnet_lock_done(ifp);
				retval = ENXIO;
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
			ifnet_lock_done(ifp);
			if (ifp->if_type == IFT_ETHER) {
				/* Update the dest to translated v6 address */
				dest6.sin6_len = sizeof(struct sockaddr_in6);
				dest6.sin6_family = AF_INET6;
				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
				dest = (const struct sockaddr *)&dest6;

				/*
				 * Lookup route to the translated destination
				 * Free this route ref during cleanup
				 */
				rt = rtalloc1_scoped((struct sockaddr *)&dest6,
				    0, 0, ifp->if_index);

				route = rt;
			}
		}
	}

	/*
	 * This path gets packet chain going to the same destination.
	 * The pre output routine is used to either trigger resolution of
	 * the next hop or retrieve the next hop's link layer addressing.
	 * For ex: ether_inet(6)_pre_output routine.
	 *
	 * If the routine returns EJUSTRETURN, it implies that packet has
	 * been queued, and therefore we have to call preout_again for the
	 * following packet in the chain.
	 *
	 * For errors other than EJUSTRETURN, the current packet is freed
	 * and the rest of the chain (pointed to by packetlist) is freed as
	 * part of clean up.
	 *
	 * Else if there is no error the retrieved information is used for
	 * all the packets in the chain.
	 */
	if (raw == 0) {
		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
		retval = 0;
		if (preoutp != NULL) {
			retval = preoutp(ifp, proto_family, &m, dest, route,
			    frame_type, dst_linkaddr);

			if (retval != 0) {
				if (retval == EJUSTRETURN) {
					goto preout_again;
				}
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
		}
	}

	/* per-packet loop: frame, filter and hand each packet to the driver */
	do {
		/*
		 * pkt_hdr is set here to point to m_data prior to
		 * calling into the framer. This value of pkt_hdr is
		 * used by the netif gso logic to retrieve the ip header
		 * for the TCP packets, offloaded for TSO processing.
		 */
		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
			uint8_t vlan_encap_len = 0;

			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
			}
			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
		} else {
			m->m_pkthdr.pkt_hdr = mtod(m, void *);
		}

		/*
		 * Perform address family translation if needed.
		 * For now we only support stateless 4 to 6 translation
		 * on the out path.
		 *
		 * The routine below translates IP header, updates protocol
		 * checksum and also translates ICMP.
		 *
		 * We skip the first packet as it is already translated and
		 * the proto family is set to PF_INET6.
		 */
		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    (ifp->if_type == IFT_CELLULAR ||
		    dlil_is_clat_needed(proto_family, m))) {
			retval = dlil_clat46(ifp, &proto_family, &m);
			/* Goto the next packet if the translation fails */
			if (retval != 0) {
				m_freem(m);
				m = NULL;
				ip6stat.ip6s_clat464_out_drop++;
				goto next;
			}
		}

#if CONFIG_DTRACE
		if (!raw) {
			dlil_output_dtrace(ifp, proto_family, m);
		}
#endif /* CONFIG_DTRACE */

		if (raw == 0 && ifp->if_framer != NULL) {
			int rcvif_set = 0;

			/*
			 * If this is a broadcast packet that needs to be
			 * looped back into the system, set the inbound ifp
			 * to that of the outbound ifp.  This will allow
			 * us to determine that it is a legitimate packet
			 * for the system.  Only set the ifp if it's not
			 * already set, just to be safe.
			 */
			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
			    m->m_pkthdr.rcvif == NULL) {
				m->m_pkthdr.rcvif = ifp;
				rcvif_set = 1;
			}
			m_loop_set = m->m_flags & M_LOOP;
			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
			    frame_type, &pre, &post);
			if (retval != 0) {
				if (retval != EJUSTRETURN) {
					m_freem(m);
				}
				goto next;
			}

			/*
			 * For partial checksum offload, adjust the start
			 * and stuff offsets based on the prepended header.
			 */
			if ((m->m_pkthdr.csum_flags &
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
				m->m_pkthdr.csum_tx_stuff += pre;
				m->m_pkthdr.csum_tx_start += pre;
			}

			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
				dlil_output_cksum_dbg(ifp, m, pre,
				    proto_family);
			}

			/*
			 * Clear the ifp if it was set above, and to be
			 * safe, only if it is still the same as the
			 * outbound ifp we have in context.  If it was
			 * looped back, then a copy of it was sent to the
			 * loopback interface with the rcvif set, and we
			 * are clearing the one that will go down to the
			 * layer below.
			 */
			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
				m->m_pkthdr.rcvif = NULL;
			}
		}

		/*
		 * Let interface filters (if any) do their thing ...
		 */
		retval = dlil_interface_filters_output(ifp, &m, proto_family);
		if (retval != 0) {
			if (retval != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * Strip away M_PROTO1 bit prior to sending packet
		 * to the driver as this field may be used by the driver
		 */
		m->m_flags &= ~M_PROTO1;

		/*
		 * If the underlying interface is not capable of handling a
		 * packet whose data portion spans across physically disjoint
		 * pages, we need to "normalize" the packet so that we pass
		 * down a chain of mbufs where each mbuf points to a span that
		 * resides in the system page boundary.  If the packet does
		 * not cross page(s), the following is a no-op.
		 */
		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
			if ((m = m_normalize(m)) == NULL) {
				goto next;
			}
		}

		/*
		 * If this is a TSO packet, make sure the interface still
		 * advertise TSO capability.
		 */
		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
			retval = EMSGSIZE;
			m_freem(m);
			goto cleanup;
		}

		ifp_inc_traffic_class_out(ifp, m);

#if SKYWALK
		/*
		 * For native skywalk devices, packets will be passed to pktap
		 * after GSO or after the mbuf to packet conversion.
		 * This is done for IPv4/IPv6 packets only because there is no
		 * space in the mbuf to pass down the proto family.
		 */
		if (dlil_is_native_netif_nexus(ifp)) {
			if (raw || m->m_pkthdr.pkt_proto == 0) {
				pktap_output(ifp, proto_family, m, pre, post);
				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
			}
		} else {
			pktap_output(ifp, proto_family, m, pre, post);
		}
#else /* SKYWALK */
		pktap_output(ifp, proto_family, m, pre, post);
#endif /* SKYWALK */

		/*
		 * Count the number of elements in the mbuf chain
		 */
		if (tx_chain_len_count) {
			dlil_count_chain_len(m, &tx_chain_len_stats);
		}

		/*
		 * Record timestamp; ifnet_enqueue() will use this info
		 * rather than redoing the work.  An optimization could
		 * involve doing this just once at the top, if there are
		 * no interface filters attached, but that's probably
		 * not a big deal.
		 */
		nanouptime(&now);
		net_timernsec(&now, &now_nsec);
		(void) mbuf_set_timestamp(m, now_nsec, TRUE);

		/*
		 * Discard partial sum information if this packet originated
		 * from another interface; the packet would already have the
		 * final checksum and we shouldn't recompute it.
		 */
		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
			m->m_pkthdr.csum_data = 0;
		}

		/*
		 * Finally, call the driver.
		 */
		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
			/* batching path: append to send_head, flushed after the loop */
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen += (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			}
			*send_tail = m;
			send_tail = &m->m_nextpkt;
		} else {
			/* immediate path: one packet per driver call */
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen = (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			} else {
				flen = 0;
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
			    0, 0, 0, 0, 0);
			retval = (*ifp->if_output_dlil)(ifp, m);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				/* translate queue state into flow advisory for the caller */
				if (adv != NULL && adv->code == FADV_SUCCESS) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp),
				    retval);
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
			    0, 0, 0, 0, 0);
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

next:
		m = packetlist;
		if (m != NULL) {
			/* restore M_LOOP, which the framer path may rely on */
			m->m_flags |= m_loop_set;
			packetlist = packetlist->m_nextpkt;
			m->m_nextpkt = NULL;
		}
		/* Reset the proto family to old proto family for CLAT */
		if (did_clat46) {
			proto_family = old_proto_family;
		}
	} while (m != NULL);

	/* flush the batched list, if any, to the driver */
	if (send_head != NULL) {
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
		    0, 0, 0, 0, 0);
		if (ifp->if_eflags & IFEF_SENDLIST) {
			/* driver accepts a whole chain in one call */
			retval = (*ifp->if_output_dlil)(ifp, send_head);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				if (adv != NULL) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp), retval);
			}
		} else {
			/* enqueue one at a time, then kick the starter thread once */
			struct mbuf *send_m;
			int enq_cnt = 0;
			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
			while (send_head != NULL) {
				send_m = send_head;
				send_head = send_m->m_nextpkt;
				send_m->m_nextpkt = NULL;
				retval = (*ifp->if_output_dlil)(ifp, send_m);
				if (retval == EQFULL || retval == EQSUSPENDED) {
					if (adv != NULL) {
						adv->code = (retval == EQFULL ?
						    FADV_FLOW_CONTROLLED :
						    FADV_SUSPENDED);
					}
					retval = 0;
				}
				if (retval == 0) {
					enq_cnt++;
					if (flen > 0) {
						fpkts++;
					}
				}
				if (retval != 0 && dlil_verbose) {
					DLIL_PRINTF("%s: output error on %s "
					    "retval = %d\n",
					    __func__, if_name(ifp), retval);
				}
			}
			if (enq_cnt > 0) {
				fbytes += flen;
				ifnet_start(ifp);
			}
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

cleanup:
	/* fold forwarded-traffic counters into the interface stats */
	if (fbytes > 0) {
		ifp->if_fbytes += fbytes;
	}
	if (fpkts > 0) {
		ifp->if_fpackets += fpkts;
	}
	if (proto != NULL) {
		if_proto_free(proto);
	}
	if (packetlist) { /* if any packets are left, clean up */
		mbuf_freem_list(packetlist);
	}
	if (retval == EJUSTRETURN) {
		retval = 0;
	}
	if (iorefcnt == 1) {
		ifnet_datamov_end(ifp);
	}
	if (rt != NULL) {
		/* route ref taken for the CLAT46-translated destination above */
		rtfree(rt);
		rt = NULL;
	}

	return retval;
}
7120 
7121 /*
7122  * This routine checks if the destination address is not a loopback, link-local,
7123  * multicast or broadcast address.
7124  */
7125 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7126 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7127 {
7128 	int ret = 0;
7129 	switch (proto_family) {
7130 	case PF_INET: {
7131 		struct ip *iph = mtod(m, struct ip *);
7132 		if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7133 			ret = 1;
7134 		}
7135 		break;
7136 	}
7137 	case PF_INET6: {
7138 		struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7139 		if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7140 		    CLAT64_NEEDED(&ip6h->ip6_dst)) {
7141 			ret = 1;
7142 		}
7143 		break;
7144 	}
7145 	}
7146 
7147 	return ret;
7148 }
7149 /*
7150  * @brief This routine translates IPv4 packet to IPv6 packet,
7151  *     updates protocol checksum and also translates ICMP for code
7152  *     along with inner header translation.
7153  *
7154  * @param ifp Pointer to the interface
7155  * @param proto_family pointer to protocol family. It is updated if function
7156  *     performs the translation successfully.
7157  * @param m Pointer to the pointer pointing to the packet. Needed because this
7158  *     routine can end up changing the mbuf to a different one.
7159  *
7160  * @return 0 on success or else a negative value.
7161  */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;      /* original v4 source/destination */
	uint8_t proto = 0;              /* original v4 transport protocol */
	struct in6_ifaddr *ia6_clat_src = NULL;
	struct in6_addr *src = NULL;    /* synthesized v6 source */
	struct in6_addr dst;            /* synthesized v6 destination */
	int error = 0;
	uint16_t off = 0;               /* v4 header length in bytes */
	uint16_t tot_len = 0;
	uint16_t ip_id_val = 0;
	uint16_t ip_frag_off = 0;

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/* wrap the mbuf in a pbuf so the nat464 routines can edit it in place */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* capture the v4 fields needed after the header is rewritten */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
	 * translation.
	 */
	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}

	src = &ia6_clat_src->ia_addr.sin6_addr;

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL;     /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	if (ia6_clat_src != NULL) {
		IFA_REMREF(&ia6_clat_src->ia_ifa);
	}

	/* hand the (possibly reallocated) mbuf back to the caller */
	if (pbuf_is_valid(pbuf)) {
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		error = -1;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	/* only report the family change to the caller on full success */
	if (error == 0) {
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7294 
7295 /*
7296  * @brief This routine translates incoming IPv6 to IPv4 packet,
7297  *     updates protocol checksum and also translates ICMPv6 outer
7298  *     and inner headers
7299  *
7300  * @return 0 on success or else a negative value.
7301  */
7302 static errno_t
dlil_clat64(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7303 dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7304 {
7305 	VERIFY(*proto_family == PF_INET6);
7306 	VERIFY(IS_INTF_CLAT46(ifp));
7307 
7308 	struct ip6_hdr *ip6h = NULL;
7309 	struct in6_addr osrc, odst;
7310 	uint8_t proto = 0;
7311 	struct in6_ifaddr *ia6_clat_dst = NULL;
7312 	struct in_ifaddr *ia4_clat_dst = NULL;
7313 	struct in_addr *dst = NULL;
7314 	struct in_addr src;
7315 	int error = 0;
7316 	uint32_t off = 0;
7317 	u_int64_t tot_len = 0;
7318 	uint8_t tos = 0;
7319 	boolean_t is_first_frag = TRUE;
7320 
7321 	/* Incoming mbuf does not contain valid IP6 header */
7322 	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
7323 	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
7324 	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
7325 		ip6stat.ip6s_clat464_in_tooshort_drop++;
7326 		return -1;
7327 	}
7328 
7329 	ip6h = mtod(*m, struct ip6_hdr *);
7330 	/* Validate that mbuf contains IP payload equal to ip6_plen  */
7331 	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
7332 		ip6stat.ip6s_clat464_in_tooshort_drop++;
7333 		return -1;
7334 	}
7335 
7336 	osrc = ip6h->ip6_src;
7337 	odst = ip6h->ip6_dst;
7338 
7339 	/*
7340 	 * Retrieve the local CLAT46 reserved IPv6 address.
7341 	 * Let the packet pass if we don't find one, as the flag
7342 	 * may get set before IPv6 configuration has taken place.
7343 	 */
7344 	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7345 	if (ia6_clat_dst == NULL) {
7346 		goto done;
7347 	}
7348 
7349 	/*
7350 	 * Check if the original dest in the packet is same as the reserved
7351 	 * CLAT46 IPv6 address
7352 	 */
7353 	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
7354 		pbuf_t pbuf_store, *pbuf = NULL;
7355 		pbuf_init_mbuf(&pbuf_store, *m, ifp);
7356 		pbuf = &pbuf_store;
7357 
7358 		/*
7359 		 * Retrive the local CLAT46 IPv4 address reserved for stateless
7360 		 * translation.
7361 		 */
7362 		ia4_clat_dst = inifa_ifpclatv4(ifp);
7363 		if (ia4_clat_dst == NULL) {
7364 			IFA_REMREF(&ia6_clat_dst->ia_ifa);
7365 			ip6stat.ip6s_clat464_in_nov4addr_drop++;
7366 			error = -1;
7367 			goto cleanup;
7368 		}
7369 		IFA_REMREF(&ia6_clat_dst->ia_ifa);
7370 
7371 		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
7372 		dst = &ia4_clat_dst->ia_addr.sin_addr;
7373 		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
7374 			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
7375 			error = -1;
7376 			goto cleanup;
7377 		}
7378 
7379 		ip6h = pbuf->pb_data;
7380 		off = sizeof(struct ip6_hdr);
7381 		proto = ip6h->ip6_nxt;
7382 		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
7383 		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);
7384 
7385 		/*
7386 		 * Translate the IP header and update the fragmentation
7387 		 * header if needed
7388 		 */
7389 		error = (nat464_translate_64(pbuf, off, tos, &proto,
7390 		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
7391 		    0 : -1;
7392 
7393 		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */
7394 
7395 		if (error != 0) {
7396 			ip6stat.ip6s_clat464_in_64transfail_drop++;
7397 			goto cleanup;
7398 		}
7399 
7400 		/*
7401 		 * Translate protocol header, update checksum, checksum flags
7402 		 * and related fields.
7403 		 */
7404 		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
7405 		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
7406 		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;
7407 
7408 		if (error != 0) {
7409 			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
7410 			goto cleanup;
7411 		}
7412 
7413 cleanup:
7414 		if (ia4_clat_dst != NULL) {
7415 			IFA_REMREF(&ia4_clat_dst->ia_ifa);
7416 		}
7417 
7418 		if (pbuf_is_valid(pbuf)) {
7419 			*m = pbuf->pb_mbuf;
7420 			pbuf->pb_mbuf = NULL;
7421 			pbuf_destroy(pbuf);
7422 		} else {
7423 			error = -1;
7424 			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
7425 		}
7426 
7427 		if (error == 0) {
7428 			*proto_family = PF_INET;
7429 			ip6stat.ip6s_clat464_in_success++;
7430 		}
7431 	} /* CLAT traffic */
7432 
7433 done:
7434 	return error;
7435 }
7436 
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/* Argument captured for a deferred ifnet ioctl (see ifnet_ioctl_async()) */
struct ifnet_ioctl_event {
	struct ifnet *ifp;      /* interface; an io ref is held on its behalf */
	u_long ioctl_code;      /* ioctl to issue from the work queue */
};

/* Work-queue entry embedding the deferred-ioctl argument; the callback
 * recovers it from the nwk_wqe member via __container_of(). */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7449 
7450 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7451 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7452 {
7453 	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7454 
7455 	/*
7456 	 * Get an io ref count if the interface is attached.
7457 	 * At this point it most likely is. We are taking a reference for
7458 	 * deferred processing.
7459 	 */
7460 	if (!ifnet_is_attached(ifp, 1)) {
7461 		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7462 		    "is not attached",
7463 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7464 		return;
7465 	}
7466 
7467 	p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7468 	    Z_WAITOK | Z_ZERO | Z_NOFAIL);
7469 
7470 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7471 	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7472 	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7473 	nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7474 }
7475 
7476 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7477 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7478 {
7479 	struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7480 	    struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7481 
7482 	struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7483 	u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7484 	int ret = 0;
7485 
7486 	if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7487 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7488 		    __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7489 	} else if (dlil_verbose) {
7490 		os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7491 		    "for ioctl %lu",
7492 		    __func__, __LINE__, if_name(ifp), ioctl_code);
7493 	}
7494 	ifnet_decr_iorefcnt(ifp);
7495 	kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7496 	return;
7497 }
7498 
/*
 * Dispatch an ioctl to, in order: every matching interface filter, the
 * attached protocol (if proto_fam != 0), and finally the driver itself.
 * retval stays EOPNOTSUPP until someone handles the ioctl; EJUSTRETURN
 * from any layer short-circuits and is reported to the caller as 0.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;        /* "nobody handled it yet" */
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock across the callout; the busy marker
			 * set above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled, stop processing" — report success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7616 
7617 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7618 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7619 {
7620 	errno_t error = 0;
7621 
7622 
7623 	if (ifp->if_set_bpf_tap) {
7624 		/* Get an io reference on the interface if it is attached */
7625 		if (!ifnet_is_attached(ifp, 1)) {
7626 			return ENXIO;
7627 		}
7628 		error = ifp->if_set_bpf_tap(ifp, mode, callback);
7629 		ifnet_decr_iorefcnt(ifp);
7630 	}
7631 	return error;
7632 }
7633 
7634 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7635 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7636     struct sockaddr *ll_addr, size_t ll_len)
7637 {
7638 	errno_t result = EOPNOTSUPP;
7639 	struct if_proto *proto;
7640 	const struct sockaddr *verify;
7641 	proto_media_resolve_multi resolvep;
7642 
7643 	if (!ifnet_is_attached(ifp, 1)) {
7644 		return result;
7645 	}
7646 
7647 	bzero(ll_addr, ll_len);
7648 
7649 	/* Call the protocol first; callee holds a proto refcnt upon success */
7650 	ifnet_lock_shared(ifp);
7651 	proto = find_attached_proto(ifp, proto_addr->sa_family);
7652 	ifnet_lock_done(ifp);
7653 	if (proto != NULL) {
7654 		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7655 		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7656 		if (resolvep != NULL) {
7657 			result = resolvep(ifp, proto_addr,
7658 			    (struct sockaddr_dl *)(void *)ll_addr, ll_len);
7659 		}
7660 		if_proto_free(proto);
7661 	}
7662 
7663 	/* Let the interface verify the multicast address */
7664 	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7665 		if (result == 0) {
7666 			verify = ll_addr;
7667 		} else {
7668 			verify = proto_addr;
7669 		}
7670 		result = ifp->if_check_multi(ifp, verify);
7671 	}
7672 
7673 	ifnet_decr_iorefcnt(ifp);
7674 	return result;
7675 }
7676 
7677 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7678 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7679     const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7680     const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7681 {
7682 	struct if_proto *proto;
7683 	errno_t result = 0;
7684 
7685 	if ((ifp->if_flags & IFF_NOARP) != 0) {
7686 		result = ENOTSUP;
7687 		goto done;
7688 	}
7689 
7690 	/* callee holds a proto refcnt upon success */
7691 	ifnet_lock_shared(ifp);
7692 	proto = find_attached_proto(ifp, target_proto->sa_family);
7693 	ifnet_lock_done(ifp);
7694 	if (proto == NULL) {
7695 		result = ENOTSUP;
7696 	} else {
7697 		proto_media_send_arp    arpp;
7698 		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7699 		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7700 		if (arpp == NULL) {
7701 			result = ENOTSUP;
7702 		} else {
7703 			switch (arpop) {
7704 			case ARPOP_REQUEST:
7705 				arpstat.txrequests++;
7706 				if (target_hw != NULL) {
7707 					arpstat.txurequests++;
7708 				}
7709 				break;
7710 			case ARPOP_REPLY:
7711 				arpstat.txreplies++;
7712 				break;
7713 			}
7714 			result = arpp(ifp, arpop, sender_hw, sender_proto,
7715 			    target_hw, target_proto);
7716 		}
7717 		if_proto_free(proto);
7718 	}
7719 done:
7720 	return result;
7721 }
7722 
/*
 * Network thread marks: an empty tag type whose returned addresses encode,
 * as an offset from net_thread_marks_base, which uu_network_marks bits a
 * push actually changed (see net_thread_marks_push / net_thread_marks_pop).
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* Cookie meaning "no bits were changed"; popping it is a no-op */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7728 
7729 __private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)7730 net_thread_marks_push(u_int32_t push)
7731 {
7732 	static const char *const base = (const void*)&net_thread_marks_base;
7733 	u_int32_t pop = 0;
7734 
7735 	if (push != 0) {
7736 		struct uthread *uth = current_uthread();
7737 
7738 		pop = push & ~uth->uu_network_marks;
7739 		if (pop != 0) {
7740 			uth->uu_network_marks |= pop;
7741 		}
7742 	}
7743 
7744 	return (net_thread_marks_t)&base[pop];
7745 }
7746 
7747 __private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)7748 net_thread_unmarks_push(u_int32_t unpush)
7749 {
7750 	static const char *const base = (const void*)&net_thread_marks_base;
7751 	u_int32_t unpop = 0;
7752 
7753 	if (unpush != 0) {
7754 		struct uthread *uth = current_uthread();
7755 
7756 		unpop = unpush & uth->uu_network_marks;
7757 		if (unpop != 0) {
7758 			uth->uu_network_marks &= ~unpop;
7759 		}
7760 	}
7761 
7762 	return (net_thread_marks_t)&base[unpop];
7763 }
7764 
/*
 * Undo net_thread_marks_push(): the cookie's offset from
 * net_thread_marks_base encodes exactly the bits that push set, so only
 * those bits are cleared, restoring the thread's previous mark state.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* cookie must fit in 32 bits and all its bits must be set */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7780 
/*
 * Undo net_thread_unmarks_push(): restore the bits that it cleared, as
 * encoded in the cookie's offset from net_thread_marks_base.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* cookie must fit in 32 bits and those bits must be clear */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
7796 
7797 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7798 net_thread_is_marked(u_int32_t check)
7799 {
7800 	if (check != 0) {
7801 		struct uthread *uth = current_uthread();
7802 		return uth->uu_network_marks & check;
7803 	} else {
7804 		return 0;
7805 	}
7806 }
7807 
7808 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7809 net_thread_is_unmarked(u_int32_t check)
7810 {
7811 	if (check != 0) {
7812 		struct uthread *uth = current_uthread();
7813 		return ~uth->uu_network_marks & check;
7814 	} else {
7815 		return 0;
7816 	}
7817 }
7818 
/*
 * An ARP announcement is a request whose sender and target protocol
 * addresses are the same IPv4 address.  Returns nonzero for that case;
 * 0 when either address is missing or they differ.
 */
static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,
    const struct sockaddr_in * target_sin)
{
	return sender_sin != NULL && target_sin != NULL &&
	       sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
}
7829 
/*
 * Send an ARP packet.  Normally delegates to dlil_send_arp_internal() on
 * the given interface; an IPv4 link-local request that is not a gratuitous
 * announcement is instead replicated on every IFEF_ARPLL interface that
 * has an IPv4 source address.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces.  The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t         *ifp_list;
		u_int32_t       count;
		u_int32_t       ifp_on;

		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing.  This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* keep the lladdr alive across the unlock */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* remember the first non-ENOTSUP outcome */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7944 
7945 /*
7946  * Caller must hold ifnet head lock.
7947  */
7948 static int
ifnet_lookup(struct ifnet * ifp)7949 ifnet_lookup(struct ifnet *ifp)
7950 {
7951 	struct ifnet *_ifp;
7952 
7953 	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7954 	TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7955 		if (_ifp == ifp) {
7956 			break;
7957 		}
7958 	}
7959 	return _ifp != NULL;
7960 }
7961 
7962 /*
7963  * Caller has to pass a non-zero refio argument to get a
7964  * IO reference count. This will prevent ifnet_detach from
7965  * being called when there are outstanding io reference counts.
7966  */
7967 int
ifnet_is_attached(struct ifnet * ifp,int refio)7968 ifnet_is_attached(struct ifnet *ifp, int refio)
7969 {
7970 	int ret;
7971 
7972 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7973 	if ((ret = IF_FULLY_ATTACHED(ifp))) {
7974 		if (refio > 0) {
7975 			ifp->if_refio++;
7976 		}
7977 	}
7978 	lck_mtx_unlock(&ifp->if_ref_lock);
7979 
7980 	return ret;
7981 }
7982 
7983 void
ifnet_incr_pending_thread_count(struct ifnet * ifp)7984 ifnet_incr_pending_thread_count(struct ifnet *ifp)
7985 {
7986 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7987 	ifp->if_threads_pending++;
7988 	lck_mtx_unlock(&ifp->if_ref_lock);
7989 }
7990 
7991 void
ifnet_decr_pending_thread_count(struct ifnet * ifp)7992 ifnet_decr_pending_thread_count(struct ifnet *ifp)
7993 {
7994 	lck_mtx_lock_spin(&ifp->if_ref_lock);
7995 	VERIFY(ifp->if_threads_pending > 0);
7996 	ifp->if_threads_pending--;
7997 	if (ifp->if_threads_pending == 0) {
7998 		wakeup(&ifp->if_threads_pending);
7999 	}
8000 	lck_mtx_unlock(&ifp->if_ref_lock);
8001 }
8002 
8003 /*
8004  * Caller must ensure the interface is attached; the assumption is that
8005  * there is at least an outstanding IO reference count held already.
8006  * Most callers would call ifnet_is_{attached,data_ready}() instead.
8007  */
8008 void
ifnet_incr_iorefcnt(struct ifnet * ifp)8009 ifnet_incr_iorefcnt(struct ifnet *ifp)
8010 {
8011 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8012 	VERIFY(IF_FULLY_ATTACHED(ifp));
8013 	VERIFY(ifp->if_refio > 0);
8014 	ifp->if_refio++;
8015 	lck_mtx_unlock(&ifp->if_ref_lock);
8016 }
8017 
/*
 * Drop one io reference with if_ref_lock already held; wakes the
 * detaching thread once the last reference goes away.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* the data-mover count can never outlive the io refcount */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8038 
/* Drop one io reference; locking wrapper around the _locked variant. */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8046 
8047 boolean_t
ifnet_datamov_begin(struct ifnet * ifp)8048 ifnet_datamov_begin(struct ifnet *ifp)
8049 {
8050 	boolean_t ret;
8051 
8052 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8053 	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8054 		ifp->if_refio++;
8055 		ifp->if_datamov++;
8056 	}
8057 	lck_mtx_unlock(&ifp->if_ref_lock);
8058 
8059 	return ret;
8060 }
8061 
/*
 * Leave the data path: drop the data-mover count (waking any drainers
 * when it hits zero) and release the io reference taken by
 * ifnet_datamov_begin().
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8079 
8080 static void
ifnet_datamov_suspend_locked(struct ifnet * ifp)8081 ifnet_datamov_suspend_locked(struct ifnet *ifp)
8082 {
8083 	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
8084 	ifp->if_refio++;
8085 	if (ifp->if_suspend++ == 0) {
8086 		VERIFY(ifp->if_refflags & IFRF_READY);
8087 		ifp->if_refflags &= ~IFRF_READY;
8088 	}
8089 }
8090 
/* Suspend new data movers; locking wrapper around the _locked variant. */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/* interface must be attached or at least still detaching */
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8099 
8100 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8101 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8102 {
8103 	lck_mtx_lock_spin(&ifp->if_ref_lock);
8104 	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8105 	if (ifp->if_suspend > 0) {
8106 		lck_mtx_unlock(&ifp->if_ref_lock);
8107 		return FALSE;
8108 	}
8109 	ifnet_datamov_suspend_locked(ifp);
8110 	lck_mtx_unlock(&ifp->if_ref_lock);
8111 	return TRUE;
8112 }
8113 
/*
 * Wait for every in-flight data mover to call ifnet_datamov_end(), then
 * flush the transmit queues.  Data movement must already be suspended.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* sleep until the last mover wakes us from ifnet_datamov_end() */
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8141 
/* Convenience: suspend new data movers, then wait out the existing ones. */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8148 
8149 void
ifnet_datamov_resume(struct ifnet * ifp)8150 ifnet_datamov_resume(struct ifnet *ifp)
8151 {
8152 	lck_mtx_lock(&ifp->if_ref_lock);
8153 	/* data movement must already be suspended */
8154 	VERIFY(ifp->if_suspend > 0);
8155 	if (--ifp->if_suspend == 0) {
8156 		VERIFY(!(ifp->if_refflags & IFRF_READY));
8157 		ifp->if_refflags |= IFRF_READY;
8158 	}
8159 	ifnet_decr_iorefcnt_locked(ifp);
8160 	lck_mtx_unlock(&ifp->if_ref_lock);
8161 }
8162 
8163 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8164 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8165 {
8166 	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8167 	ctrace_t *tr;
8168 	u_int32_t idx;
8169 	u_int16_t *cnt;
8170 
8171 	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8172 		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8173 		/* NOTREACHED */
8174 	}
8175 
8176 	if (refhold) {
8177 		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8178 		tr = dl_if_dbg->dldbg_if_refhold;
8179 	} else {
8180 		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8181 		tr = dl_if_dbg->dldbg_if_refrele;
8182 	}
8183 
8184 	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
8185 	ctrace_record(&tr[idx]);
8186 }
8187 
8188 errno_t
dlil_if_ref(struct ifnet * ifp)8189 dlil_if_ref(struct ifnet *ifp)
8190 {
8191 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8192 
8193 	if (dl_if == NULL) {
8194 		return EINVAL;
8195 	}
8196 
8197 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8198 	++dl_if->dl_if_refcnt;
8199 	if (dl_if->dl_if_refcnt == 0) {
8200 		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
8201 		/* NOTREACHED */
8202 	}
8203 	if (dl_if->dl_if_trace != NULL) {
8204 		(*dl_if->dl_if_trace)(dl_if, TRUE);
8205 	}
8206 	lck_mtx_unlock(&dl_if->dl_if_lock);
8207 
8208 	return 0;
8209 }
8210 
8211 errno_t
dlil_if_free(struct ifnet * ifp)8212 dlil_if_free(struct ifnet *ifp)
8213 {
8214 	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8215 	bool need_release = FALSE;
8216 
8217 	if (dl_if == NULL) {
8218 		return EINVAL;
8219 	}
8220 
8221 	lck_mtx_lock_spin(&dl_if->dl_if_lock);
8222 	switch (dl_if->dl_if_refcnt) {
8223 	case 0:
8224 		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
8225 		/* NOTREACHED */
8226 		break;
8227 	case 1:
8228 		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8229 			need_release = TRUE;
8230 		}
8231 		break;
8232 	default:
8233 		break;
8234 	}
8235 	--dl_if->dl_if_refcnt;
8236 	if (dl_if->dl_if_trace != NULL) {
8237 		(*dl_if->dl_if_trace)(dl_if, FALSE);
8238 	}
8239 	lck_mtx_unlock(&dl_if->dl_if_lock);
8240 	if (need_release) {
8241 		_dlil_if_release(ifp, true);
8242 	}
8243 	return 0;
8244 }
8245 
/*
 * Attach a protocol to its interface: register the demux descriptors with
 * the family module, link the proto into the interface's protocol hash,
 * and post KEV_DL_PROTO_ATTACHED.  On success *proto_count (if non-NULL)
 * receives the number of protocols now attached.  Returns EINVAL for a
 * disallowed vmnet attach, ENXIO when the interface is gone, EEXIST when
 * the family is already attached, or the family module's error.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		/* family already attached: drop the lookup ref and fail */
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* walk to the tail of the chain so insertion preserves order */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	/* drop the io ref taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8325 
8326 static void
dlil_handle_proto_attach(ifnet_t ifp,protocol_family_t protocol)8327 dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
8328 {
8329 	/*
8330 	 * A protocol has been attached, mark the interface up.
8331 	 * This used to be done by configd.KernelEventMonitor, but that
8332 	 * is inherently prone to races (rdar://problem/30810208).
8333 	 */
8334 	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
8335 	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
8336 	dlil_post_sifflags_msg(ifp);
8337 #if SKYWALK
8338 	switch (protocol) {
8339 	case AF_INET:
8340 	case AF_INET6:
8341 		/* don't attach the flowswitch unless attaching IP */
8342 		dlil_attach_flowswitch_nexus(ifp);
8343 		break;
8344 	default:
8345 		break;
8346 	}
8347 #endif /* SKYWALK */
8348 }
8349 
/*
 * Exported KPI: attach a v1 protocol handler to an interface.
 * Allocates an if_proto, copies the v1 callbacks out of proto_details,
 * and hands it to dlil_attach_protocol().  On success the interface is
 * marked up via dlil_handle_proto_attach(); on failure the if_proto is
 * freed here.  Returns EINVAL for bad arguments, ENXIO if the ifp is
 * not on the global list, EEXIST if the family is already attached.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	/* head lock held shared across the attach; released at the end */
	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	/* Z_NOFAIL: allocation cannot fail, no NULL check needed */
	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* any failure path: reclaim the unattached if_proto */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8411 
/*
 * Exported KPI: attach a v2 protocol handler to an interface.
 * Identical flow to ifnet_attach_protocol() except that the v2
 * callback set is installed (kProtoKPI_v2).  Returns EINVAL for bad
 * arguments, ENXIO if the ifp is not on the global list, EEXIST if
 * the family is already attached.
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto  *ifproto = NULL;
	uint32_t proto_count = 0;

	/* head lock held shared across the attach; released at the end */
	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	/* Z_NOFAIL: allocation cannot fail, no NULL check needed */
	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* any failure path: reclaim the unattached if_proto */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8473 
/*
 * Exported KPI: detach a protocol handler from an interface.
 * Removes the if_proto from the hash chain under the exclusive ifnet
 * lock, notifies the family module via if_del_proto, and swaps the
 * proto's callbacks for the ENXIO-returning ifproto_media_* stubs
 * (presumably so racing callers fail cleanly rather than through
 * dangling pointers — confirm against input path).  Two proto refs
 * are then dropped: the attach ref and the lookup ref; the final
 * release performs the remaining detach work.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int     retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* replace the live callbacks with inert ENXIO stubs */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8539 
8540 
/*
 * Inert v1 input stub installed by ifnet_detach_protocol() in place of
 * the detached protocol's callback; always fails with ENXIO.
 */
static errno_t
ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet, char *header)
{
#pragma unused(ifp, protocol, packet, header)
	return ENXIO;
}
8548 
/*
 * Inert v2 input stub installed by ifnet_detach_protocol() in place of
 * the detached protocol's callback; always fails with ENXIO.
 */
static errno_t
ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet)
{
#pragma unused(ifp, protocol, packet)
	return ENXIO;
}
8556 
/*
 * Inert pre-output stub installed by ifnet_detach_protocol() in place
 * of the detached protocol's callback; always fails with ENXIO.
 */
static errno_t
ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
    char *link_layer_dest)
{
#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
	return ENXIO;
}
8565 
/*
 * Inert event stub installed by ifnet_detach_protocol() in place of
 * the detached protocol's callback; silently ignores the event.
 */
static void
ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
    const struct kev_msg *event)
{
#pragma unused(ifp, protocol, event)
}
8572 
/*
 * Inert ioctl stub installed by ifnet_detach_protocol() in place of
 * the detached protocol's callback; always fails with ENXIO.
 */
static errno_t
ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
    unsigned long command, void *argument)
{
#pragma unused(ifp, protocol, command, argument)
	return ENXIO;
}
8580 
/*
 * Inert multicast-resolve stub installed by ifnet_detach_protocol() in
 * place of the detached protocol's callback; always fails with ENXIO.
 */
static errno_t
ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
    struct sockaddr_dl *out_ll, size_t ll_len)
{
#pragma unused(ifp, proto_addr, out_ll, ll_len)
	return ENXIO;
}
8588 
/*
 * Inert ARP-send stub installed by ifnet_detach_protocol() in place of
 * the detached protocol's callback; always fails with ENXIO.
 */
static errno_t
ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
	return ENXIO;
}
8597 
8598 extern int if_next_index(void);
8599 extern int tcp_ecn_outbound;
8600 
8601 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8602 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8603 {
8604 	uint32_t sflags = 0;
8605 	int err;
8606 
8607 	if (if_flowadv) {
8608 		sflags |= PKTSCHEDF_QALG_FLOWCTL;
8609 	}
8610 
8611 	if (if_delaybased_queue) {
8612 		sflags |= PKTSCHEDF_QALG_DELAYBASED;
8613 	}
8614 
8615 	if (ifp->if_output_sched_model ==
8616 	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8617 		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8618 	}
8619 	/* Inherit drop limit from the default queue */
8620 	if (ifp->if_snd != ifcq) {
8621 		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8622 	}
8623 	/* Initialize transmit queue(s) */
8624 	err = ifclassq_setup(ifcq, ifp, sflags);
8625 	if (err != 0) {
8626 		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8627 		    "err=%d", __func__, ifp, err);
8628 		/* NOTREACHED */
8629 	}
8630 }
8631 
/*
 * Attach an ifnet to the system and make it usable: verify it is
 * embryonic, place it on ifnet_head, assign an if_index, install the
 * link-layer address, set up transmit queues and I/O threads, and
 * finally mark it IFRF_ATTACHED|IFRF_READY.  The entire sequence is
 * serialized by dlil_if_lock().
 *
 * ll_addr, if non-NULL, supplies the link-layer address; its sdl_alen
 * must match if_addrlen (or if_addrlen must be 0, in which case the
 * length is adopted from ll_addr).
 *
 * Returns 0 on success; EINVAL, EEXIST, ENODEV or ENOBUFS on failure.
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif  nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	/* Adopt or validate the link-layer address length */
	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Reset the interface filter list to a known-empty state */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* A recycled (DLIF_REUSE) ifnet keeps its multicast list */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	if (ifp->if_index == 0) {
		int idx = if_next_index();

		/*
		 * Since we exhausted the list of
		 * if_index's, try to find an empty slot
		 * in ifindex2ifnet.
		 */
		if (idx == -1 && if_index >= UINT16_MAX) {
			for (int i = 1; i < if_index; i++) {
				if (ifindex2ifnet[i] == NULL &&
				    ifnet_addrs[i - 1] == NULL) {
					idx = i;
					break;
				}
			}
		}
		/* no index available at all: give up */
		if (idx == -1) {
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	IFA_ADDREF_LOCKED(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* publish the ifnet in the global lookup structures */
	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		if (err == ENODEV) {
			/* ENODEV is tolerated: no dedicated input thread */
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* bump the start thread's scheduling precedence */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* bump the poll thread's scheduling precedence */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/* Count suspended AF_LINK/AF_UNSPEC memberships kept across reuse */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that.  This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
9133 
9134 /*
 * Prepare the storage for the first/permanent link address, which
 * must have the same lifetime as the ifnet itself.  Although the link
9137  * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9138  * its location in memory must never change as it may still be referred
9139  * to by some parts of the system afterwards (unfortunate implementation
9140  * artifacts inherited from BSD.)
9141  *
9142  * Caller must hold ifnet lock as writer.
9143  */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Compute the space needed for the AF_LINK sockaddr: fixed
	 * header + interface name + link-layer address.  The netmask
	 * (masklen) covers only up to the end of the name portion.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	/* round up to a 32-bit boundary */
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			/* permanent allocation: never freed, even on detach */
			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	/* populate the address sockaddr_dl */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	/* interface name goes in sdl_data, followed by the address bytes */
	if (namelen > 0) {
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* netmask: all-ones over the name portion only */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* drop the reference on the link address we replaced, if any */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9252 
/*
 * Ask the INET and INET6 layers to remove all of their addresses from
 * the interface; called during final interface detach.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9261 
/*
 * First stage of interface detach: mark the interface as detaching,
 * remove it from the global lookup structures (ifnet_head and
 * ifindex2ifnet[]) so it is no longer visible, reset per-interface
 * state, and hand the interface to the detacher thread for final
 * teardown in ifnet_detach_final().
 *
 * Returns EINVAL if ifp is NULL or was never attached, ENXIO if a
 * detach is already in progress, 0 on success.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* stop treating CGA state as initialized for this interface */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		/* Interface was never attached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9454 
/*
 * Append a detaching ifnet to the delayed-detach queue and wake the
 * detacher thread.  Caller must hold the dlil interface lock.
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);       /* counter overflow check */
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	/* wake up ifnet_detacher_thread_cont() */
	wakeup((caddr_t)&ifnet_delayed_run);
}
9465 
9466 static struct ifnet *
ifnet_detaching_dequeue(void)9467 ifnet_detaching_dequeue(void)
9468 {
9469 	struct ifnet *ifp;
9470 
9471 	dlil_if_lock_assert();
9472 
9473 	ifp = TAILQ_FIRST(&ifnet_detaching_head);
9474 	VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9475 	if (ifp != NULL) {
9476 		VERIFY(ifnet_detaching_cnt != 0);
9477 		--ifnet_detaching_cnt;
9478 		TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9479 		ifp->if_detaching_link.tqe_next = NULL;
9480 		ifp->if_detaching_link.tqe_prev = NULL;
9481 	}
9482 	return ifp;
9483 }
9484 
/*
 * Continuation routine for the detacher thread: drain the queue of
 * detaching interfaces, running ifnet_detach_final() on each, then
 * block again waiting on ifnet_delayed_run.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		/* first wakeup after thread creation */
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constrain so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock: ifnet_detach_final() may block */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* queue drained; sleep until the next enqueue wakes us up */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9527 
/*
 * Entry point for the detacher thread: register the wait event, mark
 * the thread embryonic, post one wakeup so the continuation runs once
 * to leave the embryonic state, then block into the continuation.
 * Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9544 
/*
 * Final stage of interface detach, run from the detacher thread after
 * ifnet_detach() has made the interface invisible.  Waits for all I/O
 * references to drain, tears down filters, protocols, addresses and
 * worker threads, replaces the driver callbacks with local stubs (the
 * driver may unload), then clears IFRF_DETACHING and invokes the
 * driver's if_free callback.
 */
static void
ifnet_detach_final(struct ifnet *ifp)
{
	struct ifnet_filter *filter, *filter_next;
	struct dlil_ifnet *dlifp;
	struct ifnet_filter_head fhead;
	struct dlil_threading_info *inp;
	struct ifaddr *ifa;
	ifnet_detached_func if_free;
	int i;

#if SKYWALK
	dlil_netif_detach_notify(ifp);
	/*
	 * Wait for the datapath to quiesce before tearing down
	 * netif/flowswitch nexuses.
	 */
	dlil_quiesce_and_detach_nexuses(ifp);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}

	/*
	 * Wait until the existing IO references get released
	 * before we proceed with ifnet_detach.  This is not a
	 * common case, so block without using a continuation.
	 */
	while (ifp->if_refio > 0) {
		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
		    "to be released\n", __func__, if_name(ifp));
		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
		    (PZERO - 1), "ifnet_ioref_wait", NULL);
	}

	VERIFY(ifp->if_datamov == 0);
	VERIFY(ifp->if_drainers == 0);
	VERIFY(ifp->if_suspend == 0);
	ifp->if_refflags &= ~IFRF_READY;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* Clear agent IDs */
	if (ifp->if_agentids != NULL) {
		kfree_data(ifp->if_agentids,
		    sizeof(uuid_t) * ifp->if_agentcount);
		ifp->if_agentids = NULL;
	}
	ifp->if_agentcount = 0;

#if SKYWALK
	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
#endif /* SKYWALK */
	/* Drain and destroy send queue */
	ifclassq_teardown(ifp->if_snd);

	/* Detach interface filters */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	/* steal the filter list so it can be walked unlocked */
	fhead = ifp->if_flt_head;
	TAILQ_INIT(&ifp->if_flt_head);

	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
		filter_next = TAILQ_NEXT(filter, filt_next);
		/* drop the lock: detach callbacks may block */
		lck_mtx_unlock(&ifp->if_flt_lock);

		dlil_detach_filter_internal(filter, 1);
		lck_mtx_lock(&ifp->if_flt_lock);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Tell upper layers to drop their network addresses */
	if_purgeaddrs(ifp);

	ifnet_lock_exclusive(ifp);

	/* Unplumb all protocols */
	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
		struct if_proto *proto;

		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		while (proto != NULL) {
			protocol_family_t family = proto->protocol_family;
			/* unplumb must run without the ifnet lock held */
			ifnet_lock_done(ifp);
			proto_unplumb(family, ifp);
			ifnet_lock_exclusive(ifp);
			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		}
		/* There should not be any protocols left */
		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
	}
	kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
	ifp->if_proto_hash = NULL;

	/* Detach (permanent) link address from if_addrhead */
	ifa = TAILQ_FIRST(&ifp->if_addrhead);
	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
	IFA_LOCK(ifa);
	if_detach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Remove (permanent) link address from ifnet_addrs[] */
	IFA_REMREF(ifa);
	ifnet_addrs[ifp->if_index - 1] = NULL;

	/* This interface should not be on {ifnet_head,detaching} */
	VERIFY(ifp->if_link.tqe_next == NULL);
	VERIFY(ifp->if_link.tqe_prev == NULL);
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);

	/* The slot should have been emptied */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* There should not be any addresses left */
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));

	/*
	 * Signal the starter thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_start_thread != THREAD_NULL) {
		lck_mtx_lock_spin(&ifp->if_start_lock);
		ifp->if_start_flags |= IFSF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* wait for starter thread to terminate */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_start_thread,
			    &ifp->if_start_lock, (PZERO - 1),
			    "ifnet_start_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_start_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s starter thread termination complete",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * Signal the poller thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_poll_thread != THREAD_NULL) {
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		/* wait for poller thread to terminate */
		lck_mtx_lock(&ifp->if_poll_lock);
		while (ifp->if_poll_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_poll_thread,
			    &ifp->if_poll_lock, (PZERO - 1),
			    "ifnet_poll_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_poll_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s poller thread termination complete\n",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * If thread affinity was set for the workloop thread, we will need
	 * to tear down the affinity and release the extra reference count
	 * taken at attach time.  Does not apply to lo0 or other interfaces
	 * without dedicated input threads.
	 */
	if ((inp = ifp->if_inp) != NULL) {
		VERIFY(inp != dlil_main_input_thread);

		if (inp->dlth_affinity) {
			struct thread *tp, *wtp, *ptp;

			lck_mtx_lock_spin(&inp->dlth_lock);
			wtp = inp->dlth_driver_thread;
			inp->dlth_driver_thread = THREAD_NULL;
			ptp = inp->dlth_poller_thread;
			inp->dlth_poller_thread = THREAD_NULL;
			ASSERT(inp->dlth_thread != THREAD_NULL);
			tp = inp->dlth_thread;    /* don't nullify now */
			inp->dlth_affinity_tag = 0;
			inp->dlth_affinity = FALSE;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Tear down poll thread affinity */
			if (ptp != NULL) {
				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
				VERIFY(ifp->if_xflags & IFXF_LEGACY);
				(void) dlil_affinity_set(ptp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(ptp);
			}

			/* Tear down workloop thread affinity */
			if (wtp != NULL) {
				(void) dlil_affinity_set(wtp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(wtp);
			}

			/* Tear down DLIL input thread affinity */
			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
			thread_deallocate(tp);
		}

		/* disassociate ifp DLIL input thread */
		ifp->if_inp = NULL;

		/* if the worker thread was created, tell it to terminate */
		if (inp->dlth_thread != THREAD_NULL) {
			lck_mtx_lock_spin(&inp->dlth_lock);
			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
				wakeup_one((caddr_t)&inp->dlth_flags);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			/* drop the ifnet lock while we sleep */
			ifnet_lock_done(ifp);

			/* wait for the input thread to terminate */
			lck_mtx_lock_spin(&inp->dlth_lock);
			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
			    == 0) {
				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			ifnet_lock_exclusive(ifp);
		}

		/* clean-up input thread state */
		dlil_clean_threading_info(inp);
		/* clean-up poll parameters */
		VERIFY(ifp->if_poll_thread == THREAD_NULL);
		dlil_reset_rxpoll_params(ifp);
	}

	/* The driver might unload, so point these to ourselves */
	if_free = ifp->if_free;
	ifp->if_output_dlil = ifp_if_output;
	ifp->if_output = ifp_if_output;
	ifp->if_pre_enqueue = ifp_if_output;
	ifp->if_start = ifp_if_start;
	ifp->if_output_ctl = ifp_if_ctl;
	ifp->if_input_dlil = ifp_if_input;
	ifp->if_input_poll = ifp_if_input_poll;
	ifp->if_input_ctl = ifp_if_ctl;
	ifp->if_ioctl = ifp_if_ioctl;
	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
	ifp->if_free = ifp_if_free;
	ifp->if_demux = ifp_if_demux;
	ifp->if_event = ifp_if_event;
	ifp->if_framer_legacy = ifp_if_framer;
	ifp->if_framer = ifp_if_framer_extended;
	ifp->if_add_proto = ifp_if_add_proto;
	ifp->if_del_proto = ifp_if_del_proto;
	ifp->if_check_multi = ifp_if_check_multi;

	/* wipe out interface description */
	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	ifp->if_desc.ifd_len = 0;
	VERIFY(ifp->if_desc.ifd_desc != NULL);
	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);

	/* there shouldn't be any delegation by now */
	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	/* QoS marking get cleared */
	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);

#if SKYWALK
	/* the nexus destructor is responsible for clearing these */
	VERIFY(ifp->if_na_ops == NULL);
	VERIFY(ifp->if_na == NULL);
#endif /* SKYWALK */

	/* promiscuous/allmulti counts need to start at zero again */
	ifp->if_pcount = 0;
	ifp->if_amcount = 0;
	ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);

	ifnet_lock_done(ifp);

#if PF
	/*
	 * Detach this interface from packet filter, if enabled.
	 */
	pf_ifnet_hook(ifp, 0);
#endif /* PF */

	/* Filter list should be empty */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Last chance to drain send queue */
	if_qflush_snd(ifp, 0);

	/* Last chance to cleanup any cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	VERIFY(!ifp->if_fwd_cacheok);
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);
	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));

	ifnet_llreach_ifdetach(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);

	/*
	 * Finally, mark this ifnet as detached.
	 */
	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached\n", if_name(ifp));
	}
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	ifp->if_refflags &= ~IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);
	/* give the driver its last callback before the ifnet is recycled */
	if (if_free != NULL) {
		if_free(ifp);
	}

	ifclassq_release(&ifp->if_snd);

	/* we're fully detached, clear the "in use" bit */
	dlifp = (struct dlil_ifnet *)ifp;
	lck_mtx_lock(&dlifp->dl_if_lock);
	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
	dlifp->dl_if_flags &= ~DLIF_INUSE;
	lck_mtx_unlock(&dlifp->dl_if_lock);

	/* Release reference held during ifnet attach */
	ifnet_release(ifp);
}
9926 
/*
 * Stub output handler installed on a detached ifnet: silently
 * discard the packet chain.
 */
errno_t
ifp_if_output(struct ifnet *ifp, struct mbuf *m)
{
#pragma unused(ifp)
	m_freem_list(m);
	return 0;
}
9934 
/*
 * Stub start handler installed on a detached ifnet: purge any packets
 * remaining in the interface queues.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9940 
/*
 * Stub input handler installed on a detached ifnet: drop the chain
 * and report the interface as gone.
 */
static errno_t
ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
#pragma unused(ifp, m_tail, s, poll, tp)
	m_freem_list(m_head);
	return ENXIO;
}
9950 
9951 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9952 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9953     struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9954 {
9955 #pragma unused(ifp, flags, max_cnt)
9956 	if (m_head != NULL) {
9957 		*m_head = NULL;
9958 	}
9959 	if (m_tail != NULL) {
9960 		*m_tail = NULL;
9961 	}
9962 	if (cnt != NULL) {
9963 		*cnt = 0;
9964 	}
9965 	if (len != NULL) {
9966 		*len = 0;
9967 	}
9968 }
9969 
/*
 * Stub output/input control handler installed on a detached ifnet.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
9976 
/*
 * Stub demux handler installed on a detached ifnet: consume and
 * drop the packet.
 */
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	m_freem(m);
	return EJUSTRETURN;	/* mbuf already consumed */
}
9984 
/*
 * Stub add-protocol handler installed on a detached ifnet: no
 * protocol may be plumbed anymore.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
9992 
/*
 * Stub delete-protocol handler installed on a detached ifnet.
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
9999 
/*
 * Stub multicast-membership check installed on a detached ifnet.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
10006 
/*
 * Stub legacy framer installed on a detached ifnet; forwards to the
 * extended framer.  On non-macOS targets the legacy callback itself
 * carries the pre/post length parameters; on macOS they are not part
 * of the legacy signature and are passed as NULL.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10025 
10026 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10027 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10028     const struct sockaddr *sa, const char *ll, const char *t,
10029     u_int32_t *pre, u_int32_t *post)
10030 {
10031 #pragma unused(ifp, sa, ll, t)
10032 	m_freem(*m);
10033 	*m = NULL;
10034 
10035 	if (pre != NULL) {
10036 		*pre = 0;
10037 	}
10038 	if (post != NULL) {
10039 		*post = 0;
10040 	}
10041 
10042 	return EJUSTRETURN;
10043 }
10044 
/*
 * Stub ioctl handler installed on a detached ifnet.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
10051 
/*
 * Stub BPF tap handler installed on a detached ifnet: accept and
 * ignore the request.
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
10059 
/*
 * Stub free callback installed on a detached ifnet after the driver's
 * own if_free has been invoked: intentionally a no-op.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10065 
/*
 * Stub event handler installed on a detached ifnet: intentionally
 * a no-op.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10071 
/*
 * Find or allocate a dlil_ifnet for a new interface of the given family.
 *
 * The cache list is first scanned in full: an in-use entry matching
 * either the extended name or the unique id yields EBUSY; otherwise the
 * first not-in-use entry with a matching unique id is recycled.  Failing
 * that, a fresh 64-bit-aligned dlil_ifnet is carved out of dlif_zone and
 * fully initialized.  On success, *ifp holds a referenced ifnet and 0 is
 * returned; ENOMEM is returned if the unique id copy cannot be allocated.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		/* keep a private copy of the caller-supplied unique id */
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* names point into the in-struct storage areas */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* On the allocation path, the carved object must be 64-bit aligned */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10249 
/*
 * Common teardown for an ifnet returning to the dlil cache: balances
 * the net API allocation counters, frees any heap-allocated broadcast
 * address, points if_name/if_xname back at the in-struct storage, and
 * optionally clears DLIF_INUSE so dlil_if_acquire() can recycle the
 * structure.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* lengths beyond the inline buffer indicate a separate allocation */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10280 
/*
 * Public release entry point; resets cached naming/address state but
 * leaves DLIF_INUSE set (the in-use flag is cleared elsewhere).
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10286 
/* Acquire the global dlil interface-cache mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10292 
/* Release the global dlil interface-cache mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10298 
/* Assert that the current thread owns the dlil interface-cache mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10304 
/* Detach the explicitly-plumbed protocols (IPv4, IPv6) from an interface. */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10320 
/*
 * Copy the interface's cached IPv4 source route into *dst, taking a
 * reference on its rtentry (route_copyout semantics).
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	/* take the lock in spin mode, then convert since copyout may block */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10331 
/*
 * Install *src as the interface's cached IPv4 source route; consumes
 * the caller's reference.  If forwarding-cache use is disabled on the
 * interface, the route is simply released instead.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10345 
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the cached source
 * route (with a reference) into *dst.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10357 
/*
 * IPv6 counterpart of ifp_src_route_copyin(): install *src as the
 * cached source route (consuming the reference), or release it when
 * the forwarding cache is disabled.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10372 
/*
 * Return a referenced rtentry for src_ip, scoped to ifp, using the
 * interface's one-entry route cache.  On a cache miss a scoped lookup
 * is performed and the result is stored back.  May return NULL when no
 * route exists; the caller owns the returned reference.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route            src_rt;
	struct sockaddr_in      *dst;

	/* dst aliases src_rt.ro_dst so writes below shape the local route */
	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	/* cache miss: stale route or a different destination address */
	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry  *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10407 
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): return a referenced
 * rtentry for *src_ip6 scoped to ifp, refreshing the per-interface
 * one-entry route cache on a miss.  May return NULL; the caller owns
 * the returned reference.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/*
		 * NOTE(review): the v4 path VERIFYs ro_rt == NULL and looks
		 * up unconditionally; here the lookup is guarded instead —
		 * presumably equivalent after ROUTE_RELEASE, but confirm.
		 */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry  *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10444 
/*
 * Update the interface's link-quality metric state and post a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED event on change.
 *
 * lqm is normalized down to one of the threshold edge values first.
 * 'locked' indicates the caller already holds the ifnet lock exclusive;
 * the lock is always dropped before posting the kernel event and is
 * reacquired on return only when the caller held it.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* schedule the fast PCB timer to abort affected connections */
		atomic_bitset_32(&tcbinfo.ipi_flags,
		    INPCBINFO_HANDLE_LQM_ABORT);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return;         /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10509 
/*
 * Update the interface's RRC (radio resource control) state and post a
 * KEV_DL_RRC_STATE_CHANGED event on change.
 *
 * Must be called with the ifnet lock held exclusive; the lock is
 * dropped while the kernel event is posted and reacquired before
 * returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* no-op when the state is already valid and unchanged */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10539 
10540 errno_t
if_state_update(struct ifnet * ifp,struct if_interface_state * if_interface_state)10541 if_state_update(struct ifnet *ifp,
10542     struct if_interface_state *if_interface_state)
10543 {
10544 	u_short if_index_available = 0;
10545 
10546 	ifnet_lock_exclusive(ifp);
10547 
10548 	if ((ifp->if_type != IFT_CELLULAR) &&
10549 	    (if_interface_state->valid_bitmask &
10550 	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10551 		ifnet_lock_done(ifp);
10552 		return ENOTSUP;
10553 	}
10554 	if ((if_interface_state->valid_bitmask &
10555 	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10556 	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10557 	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10558 		ifnet_lock_done(ifp);
10559 		return EINVAL;
10560 	}
10561 	if ((if_interface_state->valid_bitmask &
10562 	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10563 	    if_interface_state->rrc_state !=
10564 	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10565 	    if_interface_state->rrc_state !=
10566 	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10567 		ifnet_lock_done(ifp);
10568 		return EINVAL;
10569 	}
10570 
10571 	if (if_interface_state->valid_bitmask &
10572 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10573 		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10574 	}
10575 	if (if_interface_state->valid_bitmask &
10576 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10577 		if_rrc_state_update(ifp, if_interface_state->rrc_state);
10578 	}
10579 	if (if_interface_state->valid_bitmask &
10580 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10581 		ifp->if_interface_state.valid_bitmask |=
10582 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10583 		ifp->if_interface_state.interface_availability =
10584 		    if_interface_state->interface_availability;
10585 
10586 		if (ifp->if_interface_state.interface_availability ==
10587 		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10588 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10589 			    __func__, if_name(ifp), ifp->if_index);
10590 			if_index_available = ifp->if_index;
10591 		} else {
10592 			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
10593 			    __func__, if_name(ifp), ifp->if_index);
10594 		}
10595 	}
10596 	ifnet_lock_done(ifp);
10597 
10598 	/*
10599 	 * Check if the TCP connections going on this interface should be
10600 	 * forced to send probe packets instead of waiting for TCP timers
10601 	 * to fire. This is done on an explicit notification such as
10602 	 * SIOCSIFINTERFACESTATE which marks the interface as available.
10603 	 */
10604 	if (if_index_available > 0) {
10605 		tcp_interface_send_probe(if_index_available);
10606 	}
10607 
10608 	return 0;
10609 }
10610 
10611 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10612 if_get_state(struct ifnet *ifp,
10613     struct if_interface_state *if_interface_state)
10614 {
10615 	ifnet_lock_shared(ifp);
10616 
10617 	if_interface_state->valid_bitmask = 0;
10618 
10619 	if (ifp->if_interface_state.valid_bitmask &
10620 	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
10621 		if_interface_state->valid_bitmask |=
10622 		    IF_INTERFACE_STATE_RRC_STATE_VALID;
10623 		if_interface_state->rrc_state =
10624 		    ifp->if_interface_state.rrc_state;
10625 	}
10626 	if (ifp->if_interface_state.valid_bitmask &
10627 	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
10628 		if_interface_state->valid_bitmask |=
10629 		    IF_INTERFACE_STATE_LQM_STATE_VALID;
10630 		if_interface_state->lqm_state =
10631 		    ifp->if_interface_state.lqm_state;
10632 	}
10633 	if (ifp->if_interface_state.valid_bitmask &
10634 	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10635 		if_interface_state->valid_bitmask |=
10636 		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10637 		if_interface_state->interface_availability =
10638 		    ifp->if_interface_state.interface_availability;
10639 	}
10640 
10641 	ifnet_lock_done(ifp);
10642 }
10643 
10644 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10645 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10646 {
10647 	if (conn_probe > 1) {
10648 		return EINVAL;
10649 	}
10650 	if (conn_probe == 0) {
10651 		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10652 	} else {
10653 		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10654 	}
10655 
10656 #if NECP
10657 	necp_update_all_clients();
10658 #endif /* NECP */
10659 
10660 	tcp_probe_connectivity(ifp, conn_probe);
10661 	return 0;
10662 }
10663 
10664 /* for uuid.c */
10665 static int
get_ether_index(int * ret_other_index)10666 get_ether_index(int * ret_other_index)
10667 {
10668 	struct ifnet *ifp;
10669 	int en0_index = 0;
10670 	int other_en_index = 0;
10671 	int any_ether_index = 0;
10672 	short best_unit = 0;
10673 
10674 	*ret_other_index = 0;
10675 	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
10676 		/*
10677 		 * find en0, or if not en0, the lowest unit en*, and if not
10678 		 * that, any ethernet
10679 		 */
10680 		ifnet_lock_shared(ifp);
10681 		if (strcmp(ifp->if_name, "en") == 0) {
10682 			if (ifp->if_unit == 0) {
10683 				/* found en0, we're done */
10684 				en0_index = ifp->if_index;
10685 				ifnet_lock_done(ifp);
10686 				break;
10687 			}
10688 			if (other_en_index == 0 || ifp->if_unit < best_unit) {
10689 				other_en_index = ifp->if_index;
10690 				best_unit = ifp->if_unit;
10691 			}
10692 		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
10693 			any_ether_index = ifp->if_index;
10694 		}
10695 		ifnet_lock_done(ifp);
10696 	}
10697 	if (en0_index == 0) {
10698 		if (other_en_index != 0) {
10699 			*ret_other_index = other_en_index;
10700 		} else if (any_ether_index != 0) {
10701 			*ret_other_index = any_ether_index;
10702 		}
10703 	}
10704 	return en0_index;
10705 }
10706 
/*
 * Fill node[] (ETHER_ADDR_LEN bytes) with an ethernet address suitable
 * for UUID generation, preferring en0, then the best fallback from
 * get_ether_index().  The permanent (burned-in) address is used when
 * known, since it never changes.  Returns 0 on success, -1 when no
 * ethernet interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	/* cached across calls; revalidated against ifindex2ifnet below */
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10748 
10749 static int
10750 sysctl_rxpoll SYSCTL_HANDLER_ARGS
10751 {
10752 #pragma unused(arg1, arg2)
10753 	uint32_t i;
10754 	int err;
10755 
10756 	i = if_rxpoll;
10757 
10758 	err = sysctl_handle_int(oidp, &i, 0, req);
10759 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10760 		return err;
10761 	}
10762 
10763 	if (net_rxpoll == 0) {
10764 		return ENXIO;
10765 	}
10766 
10767 	if_rxpoll = i;
10768 	return err;
10769 }
10770 
10771 static int
10772 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
10773 {
10774 #pragma unused(arg1, arg2)
10775 	uint64_t q;
10776 	int err;
10777 
10778 	q = if_rxpoll_mode_holdtime;
10779 
10780 	err = sysctl_handle_quad(oidp, &q, 0, req);
10781 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10782 		return err;
10783 	}
10784 
10785 	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
10786 		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
10787 	}
10788 
10789 	if_rxpoll_mode_holdtime = q;
10790 
10791 	return err;
10792 }
10793 
10794 static int
10795 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
10796 {
10797 #pragma unused(arg1, arg2)
10798 	uint64_t q;
10799 	int err;
10800 
10801 	q = if_rxpoll_sample_holdtime;
10802 
10803 	err = sysctl_handle_quad(oidp, &q, 0, req);
10804 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10805 		return err;
10806 	}
10807 
10808 	if (q < IF_RXPOLL_SAMPLETIME_MIN) {
10809 		q = IF_RXPOLL_SAMPLETIME_MIN;
10810 	}
10811 
10812 	if_rxpoll_sample_holdtime = q;
10813 
10814 	return err;
10815 }
10816 
10817 static int
10818 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
10819 {
10820 #pragma unused(arg1, arg2)
10821 	uint64_t q;
10822 	int err;
10823 
10824 	q = if_rxpoll_interval_time;
10825 
10826 	err = sysctl_handle_quad(oidp, &q, 0, req);
10827 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10828 		return err;
10829 	}
10830 
10831 	if (q < IF_RXPOLL_INTERVALTIME_MIN) {
10832 		q = IF_RXPOLL_INTERVALTIME_MIN;
10833 	}
10834 
10835 	if_rxpoll_interval_time = q;
10836 
10837 	return err;
10838 }
10839 
10840 static int
10841 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
10842 {
10843 #pragma unused(arg1, arg2)
10844 	uint32_t i;
10845 	int err;
10846 
10847 	i = if_sysctl_rxpoll_wlowat;
10848 
10849 	err = sysctl_handle_int(oidp, &i, 0, req);
10850 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10851 		return err;
10852 	}
10853 
10854 	if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
10855 		return EINVAL;
10856 	}
10857 
10858 	if_sysctl_rxpoll_wlowat = i;
10859 	return err;
10860 }
10861 
10862 static int
10863 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
10864 {
10865 #pragma unused(arg1, arg2)
10866 	uint32_t i;
10867 	int err;
10868 
10869 	i = if_sysctl_rxpoll_whiwat;
10870 
10871 	err = sysctl_handle_int(oidp, &i, 0, req);
10872 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10873 		return err;
10874 	}
10875 
10876 	if (i <= if_sysctl_rxpoll_wlowat) {
10877 		return EINVAL;
10878 	}
10879 
10880 	if_sysctl_rxpoll_whiwat = i;
10881 	return err;
10882 }
10883 
10884 static int
10885 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
10886 {
10887 #pragma unused(arg1, arg2)
10888 	int i, err;
10889 
10890 	i = if_sndq_maxlen;
10891 
10892 	err = sysctl_handle_int(oidp, &i, 0, req);
10893 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10894 		return err;
10895 	}
10896 
10897 	if (i < IF_SNDQ_MINLEN) {
10898 		i = IF_SNDQ_MINLEN;
10899 	}
10900 
10901 	if_sndq_maxlen = i;
10902 	return err;
10903 }
10904 
10905 static int
10906 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
10907 {
10908 #pragma unused(arg1, arg2)
10909 	int i, err;
10910 
10911 	i = if_rcvq_maxlen;
10912 
10913 	err = sysctl_handle_int(oidp, &i, 0, req);
10914 	if (err != 0 || req->newptr == USER_ADDR_NULL) {
10915 		return err;
10916 	}
10917 
10918 	if (i < IF_RCVQ_MINLEN) {
10919 		i = IF_RCVQ_MINLEN;
10920 	}
10921 
10922 	if_rcvq_maxlen = i;
10923 	return err;
10924 }
10925 
10926 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10927 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10928     int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10929 {
10930 	struct kev_dl_node_presence kev;
10931 	struct sockaddr_dl *sdl;
10932 	struct sockaddr_in6 *sin6;
10933 	int ret = 0;
10934 
10935 	VERIFY(ifp);
10936 	VERIFY(sa);
10937 	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10938 
10939 	bzero(&kev, sizeof(kev));
10940 	sin6 = &kev.sin6_node_address;
10941 	sdl = &kev.sdl_node_address;
10942 	nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10943 	kev.rssi = rssi;
10944 	kev.link_quality_metric = lqm;
10945 	kev.node_proximity_metric = npm;
10946 	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10947 
10948 	ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10949 	if (ret == 0 || ret == EEXIST) {
10950 		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10951 		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10952 		if (err != 0) {
10953 			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10954 			    "error %d\n", __func__, err);
10955 		}
10956 	}
10957 
10958 	if (ret == EEXIST) {
10959 		ret = 0;
10960 	}
10961 	return ret;
10962 }
10963 
/*
 * Record the departure of a neighbor node and, if the ND layer agrees
 * the node was known, post a KEV_DL_NODE_ABSENCE event.  The caller may
 * pass either the node's IPv6 address or its AF_LINK address; the
 * missing half is derived before posting.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	/* only announce absence for nodes the ND layer actually knew */
	if (error == 0) {
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
11004 
/*
 * Variant of dlil_node_present() where the caller supplies the IPv6
 * address and the link-layer address separately (no decomposition
 * needed).  Registers the node with ND and posts KEV_DL_NODE_PRESENCE;
 * EEXIST is treated as success with the event flagged as an update.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	/* stamp the event copy with this interface's type and index */
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11048 
11049 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11050 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11051     kauth_cred_t *credp)
11052 {
11053 	const u_int8_t *bytes;
11054 	size_t size;
11055 
11056 	bytes = CONST_LLADDR(sdl);
11057 	size = sdl->sdl_alen;
11058 
11059 #if CONFIG_MACF
11060 	if (dlil_lladdr_ckreq) {
11061 		switch (sdl->sdl_type) {
11062 		case IFT_ETHER:
11063 		case IFT_IEEE1394:
11064 			break;
11065 		default:
11066 			credp = NULL;
11067 			break;
11068 		}
11069 		;
11070 
11071 		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11072 			static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11073 				[0] = 2
11074 			};
11075 
11076 			bytes = unspec;
11077 		}
11078 	}
11079 #else
11080 #pragma unused(credp)
11081 #endif
11082 
11083 	if (sizep != NULL) {
11084 		*sizep = size;
11085 	}
11086 	return bytes;
11087 }
11088 
/*
 * Post a KEV_DL_ISSUES event reporting a module-identified problem on
 * the interface.  modid (DLIL_MODIDLEN bytes) identifies the reporting
 * module; info (DLIL_MODARGLEN bytes) is optional module-specific data.
 * The event carries the current wall-clock seconds as its timestamp.
 */
void
dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
    u_int8_t info[DLIL_MODARGLEN])
{
	struct kev_dl_issues kev;
	struct timeval tv;

	VERIFY(ifp != NULL);
	VERIFY(modid != NULL);
	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);

	/* bzero (not struct init) so padding copied to userspace is zeroed */
	bzero(&kev, sizeof(kev));

	microtime(&tv);
	kev.timestamp = tv.tv_sec;
	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
	if (info != NULL) {
		bcopy(info, &kev.info, DLIL_MODARGLEN);
	}

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
	    &kev.link_data, sizeof(kev), FALSE);
}
11113 
11114 errno_t
ifnet_getset_opportunistic(ifnet_t ifp,u_long cmd,struct ifreq * ifr,struct proc * p)11115 ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11116     struct proc *p)
11117 {
11118 	u_int32_t level = IFNET_THROTTLE_OFF;
11119 	errno_t result = 0;
11120 
11121 	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);
11122 
11123 	if (cmd == SIOCSIFOPPORTUNISTIC) {
11124 		/*
11125 		 * XXX: Use priv_check_cred() instead of root check?
11126 		 */
11127 		if ((result = proc_suser(p)) != 0) {
11128 			return result;
11129 		}
11130 
11131 		if (ifr->ifr_opportunistic.ifo_flags ==
11132 		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
11133 			level = IFNET_THROTTLE_OPPORTUNISTIC;
11134 		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
11135 			level = IFNET_THROTTLE_OFF;
11136 		} else {
11137 			result = EINVAL;
11138 		}
11139 
11140 		if (result == 0) {
11141 			result = ifnet_set_throttle(ifp, level);
11142 		}
11143 	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
11144 		ifr->ifr_opportunistic.ifo_flags = 0;
11145 		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
11146 			ifr->ifr_opportunistic.ifo_flags |=
11147 			    IFRIFOF_BLOCK_OPPORTUNISTIC;
11148 		}
11149 	}
11150 
11151 	/*
11152 	 * Return the count of current opportunistic connections
11153 	 * over the interface.
11154 	 */
11155 	if (result == 0) {
11156 		uint32_t flags = 0;
11157 		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
11158 		    INPCB_OPPORTUNISTIC_SETCMD : 0;
11159 		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
11160 		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
11161 		ifr->ifr_opportunistic.ifo_inuse =
11162 		    udp_count_opportunistic(ifp->if_index, flags) +
11163 		    tcp_count_opportunistic(ifp->if_index, flags);
11164 	}
11165 
11166 	if (result == EALREADY) {
11167 		result = 0;
11168 	}
11169 
11170 	return result;
11171 }
11172 
/*
 * Query the current transmit throttling level of an interface.
 *
 * Only interfaces with a starter thread (IFEF_TXSTART) support
 * throttling; others get ENXIO.  If the send queue is not an enabled
 * IFCQ instance, IFNET_THROTTLE_OFF is reported.
 */
int
ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	*level = IFNET_THROTTLE_OFF;

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* Throttling works only for IFCQ, not ALTQ instances */
	if (IFCQ_IS_ENABLED(ifq)) {
		/* first field 0 == query only; level is filled in by callee */
		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
		*level = req.level;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
11198 
/*
 * Set the transmit throttling level of an interface.
 *
 * Only IFNET_THROTTLE_OFF and IFNET_THROTTLE_OPPORTUNISTIC are valid;
 * the interface must have a starter thread (IFEF_TXSTART).  On
 * success, NECP clients are notified, and when throttling is turned
 * off the starter thread is kicked so packets held back while
 * throttled get transmitted.
 */
int
ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;

	switch (level) {
	case IFNET_THROTTLE_OFF:
	case IFNET_THROTTLE_OPPORTUNISTIC:
		break;
	default:
		return EINVAL;
	}

	IFCQ_LOCK(ifq);
	if (IFCQ_IS_ENABLED(ifq)) {
		/* first field 1 == set the level (as opposed to query) */
		cqrq_throttle_t req = { 1, level };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
	}
	IFCQ_UNLOCK(ifq);

	if (err == 0) {
		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
		    level);
#if NECP
		necp_update_all_clients();
#endif /* NECP */
		if (level == IFNET_THROTTLE_OFF) {
			ifnet_start(ifp);
		}
	}

	return err;
}
11240 
/*
 * Handle SIOCSIFLOG/SIOCGIFLOG: set or get the logging parameters of
 * an interface.  Setting requires PRIV_NET_INTERFACE_CONTROL; the
 * level must fall in [IFNET_LOG_MIN, IFNET_LOG_MAX] and at least one
 * flag within IFNET_LOGF_MASK must be requested.
 */
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* Keep only the flags we understand; none left is an error */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11288 
11289 int
ifnet_set_log(struct ifnet * ifp,int32_t level,uint32_t flags,int32_t category,int32_t subcategory)11290 ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
11291     int32_t category, int32_t subcategory)
11292 {
11293 	int err = 0;
11294 
11295 	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
11296 	VERIFY(flags & IFNET_LOGF_MASK);
11297 
11298 	/*
11299 	 * The logging level applies to all facilities; make sure to
11300 	 * update them all with the most current level.
11301 	 */
11302 	flags |= ifp->if_log.flags;
11303 
11304 	if (ifp->if_output_ctl != NULL) {
11305 		struct ifnet_log_params l;
11306 
11307 		bzero(&l, sizeof(l));
11308 		l.level = level;
11309 		l.flags = flags;
11310 		l.flags &= ~IFNET_LOGF_DLIL;
11311 		l.category = category;
11312 		l.subcategory = subcategory;
11313 
11314 		/* Send this request to lower layers */
11315 		if (l.flags != 0) {
11316 			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
11317 			    sizeof(l), &l);
11318 		}
11319 	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
11320 		/*
11321 		 * If targeted to the lower layers without an output
11322 		 * control callback registered on the interface, just
11323 		 * silently ignore facilities other than ours.
11324 		 */
11325 		flags &= IFNET_LOGF_DLIL;
11326 		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
11327 			level = 0;
11328 		}
11329 	}
11330 
11331 	if (err == 0) {
11332 		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
11333 			ifp->if_log.flags = 0;
11334 		} else {
11335 			ifp->if_log.flags |= flags;
11336 		}
11337 
11338 		log(LOG_INFO, "%s: logging level set to %d flags=%b "
11339 		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
11340 		    ifp->if_log.level, ifp->if_log.flags,
11341 		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
11342 		    category, subcategory);
11343 	}
11344 
11345 	return err;
11346 }
11347 
/*
 * Report the interface's current logging parameters.  Every output
 * pointer is optional; pass NULL for any field the caller does not
 * need.  Always returns 0.
 */
int
ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
    int32_t *category, int32_t *subcategory)
{
	if (level != NULL) {
		*level = ifp->if_log.level;
	}
	if (flags != NULL) {
		*flags = ifp->if_log.flags;
	}
	if (category != NULL) {
		*category = ifp->if_log.category;
	}
	if (subcategory != NULL) {
		*subcategory = ifp->if_log.subcategory;
	}

	return 0;
}
11367 
11368 int
ifnet_notify_address(struct ifnet * ifp,int af)11369 ifnet_notify_address(struct ifnet *ifp, int af)
11370 {
11371 	struct ifnet_notify_address_params na;
11372 
11373 #if PF
11374 	(void) pf_ifaddr_hook(ifp);
11375 #endif /* PF */
11376 
11377 	if (ifp->if_output_ctl == NULL) {
11378 		return EOPNOTSUPP;
11379 	}
11380 
11381 	bzero(&na, sizeof(na));
11382 	na.address_family = (sa_family_t)af;
11383 
11384 	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11385 	           sizeof(na), &na);
11386 }
11387 
11388 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11389 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11390 {
11391 	if (ifp == NULL || flowid == NULL) {
11392 		return EINVAL;
11393 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11394 	    !IF_FULLY_ATTACHED(ifp)) {
11395 		return ENXIO;
11396 	}
11397 
11398 	*flowid = ifp->if_flowhash;
11399 
11400 	return 0;
11401 }
11402 
11403 errno_t
ifnet_disable_output(struct ifnet * ifp)11404 ifnet_disable_output(struct ifnet *ifp)
11405 {
11406 	int err;
11407 
11408 	if (ifp == NULL) {
11409 		return EINVAL;
11410 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11411 	    !IF_FULLY_ATTACHED(ifp)) {
11412 		return ENXIO;
11413 	}
11414 
11415 	if ((err = ifnet_fc_add(ifp)) == 0) {
11416 		lck_mtx_lock_spin(&ifp->if_start_lock);
11417 		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11418 		lck_mtx_unlock(&ifp->if_start_lock);
11419 	}
11420 	return err;
11421 }
11422 
11423 errno_t
ifnet_enable_output(struct ifnet * ifp)11424 ifnet_enable_output(struct ifnet *ifp)
11425 {
11426 	if (ifp == NULL) {
11427 		return EINVAL;
11428 	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11429 	    !IF_FULLY_ATTACHED(ifp)) {
11430 		return ENXIO;
11431 	}
11432 
11433 	ifnet_start_common(ifp, TRUE);
11434 	return 0;
11435 }
11436 
/*
 * Flow advisory: a lower layer signals that the flow identified by
 * flowhash may transmit again.  Look up (and atomically remove) the
 * matching flow-control entry and re-enable output on its interface,
 * provided the interface is still attached and its flow hash has not
 * been recomputed since the entry was added.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* release the I/O ref taken by ifnet_is_attached() above */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11460 
11461 /*
11462  * Function to compare ifnet_fc_entries in ifnet flow control tree
11463  */
11464 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11465 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11466 {
11467 	return fc1->ifce_flowhash - fc2->ifce_flowhash;
11468 }
11469 
/*
 * Register an interface in the flow-control tree keyed by its current
 * flow hash, so a later flow advisory can map the hash back to the
 * interface.  Returns 0 when the entry was added or already present
 * for this interface, or EAGAIN when a different interface already
 * occupies the same hash (rare collision; we simply decline).
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex: zalloc with Z_WAITOK may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11513 
/*
 * Look up and remove the flow-control entry for the given flow hash.
 *
 * Returns the removed entry (ownership passes to the caller, who must
 * free it via ifnet_fc_entry_free()), or NULL when no entry exists or
 * the associated interface is not attached.
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11551 
/*
 * Return a flow-control entry to its zone.
 */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11557 
11558 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11559 ifnet_calc_flowhash(struct ifnet *ifp)
11560 {
11561 	struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11562 	uint32_t flowhash = 0;
11563 
11564 	if (ifnet_flowhash_seed == 0) {
11565 		ifnet_flowhash_seed = RandomULong();
11566 	}
11567 
11568 	bzero(&fh, sizeof(fh));
11569 
11570 	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11571 	fh.ifk_unit = ifp->if_unit;
11572 	fh.ifk_flags = ifp->if_flags;
11573 	fh.ifk_eflags = ifp->if_eflags;
11574 	fh.ifk_capabilities = ifp->if_capabilities;
11575 	fh.ifk_capenable = ifp->if_capenable;
11576 	fh.ifk_output_sched_model = ifp->if_output_sched_model;
11577 	fh.ifk_rand1 = RandomULong();
11578 	fh.ifk_rand2 = RandomULong();
11579 
11580 try_again:
11581 	flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11582 	if (flowhash == 0) {
11583 		/* try to get a non-zero flowhash */
11584 		ifnet_flowhash_seed = RandomULong();
11585 		goto try_again;
11586 	}
11587 
11588 	return flowhash;
11589 }
11590 
/*
 * Store an opaque network signature for the given address family on
 * the interface.  A zero length clears any existing signature; a
 * length larger than the preallocated per-family buffer is EINVAL.
 * ENOMEM is returned when the per-family extension area has not been
 * allocated.  The flags argument is currently unused.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* break leaves the switch; unlock here */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11652 
/*
 * Copy out the network signature for the given address family.
 *
 * On input, *len is the caller's buffer capacity; on success it is
 * updated to the stored signature length.  Returns EINVAL on bad
 * arguments or a too-small (or zero-sized) buffer, ENOENT when no
 * signature is set, ENOMEM when the per-family extension area has
 * not been allocated.  *flags, when non-NULL, is reported as 0 on
 * success.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11713 
/*
 * Install (or clear) the set of NAT64 prefixes on an interface.
 *
 * Each of the NAT64_MAX_NUM_PREFIXES slots is processed in order: a
 * zero prefix length clears the slot; otherwise the length must be
 * one of the supported NAT64 prefix lengths (32/40/48/56/64/96 bits)
 * and the prefix must not carry an embedded interface/link-local
 * scope.  If at least one prefix was set and no error occurred, NECP
 * clients are told to re-evaluate.  Note that slots processed before
 * a validation failure remain modified.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11779 
11780 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)11781 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11782 {
11783 	int i, found_one = 0, error = 0;
11784 
11785 	if (ifp == NULL) {
11786 		return EINVAL;
11787 	}
11788 
11789 	if_inet6data_lock_shared(ifp);
11790 
11791 	if (IN6_IFEXTRA(ifp) == NULL) {
11792 		error = ENOMEM;
11793 		goto out;
11794 	}
11795 
11796 	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11797 		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
11798 			found_one = 1;
11799 		}
11800 	}
11801 
11802 	if (found_one == 0) {
11803 		error = ENOENT;
11804 		goto out;
11805 	}
11806 
11807 	if (prefixes) {
11808 		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
11809 		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
11810 	}
11811 
11812 out:
11813 	if_inet6data_lock_done(ifp);
11814 
11815 	return error;
11816 }
11817 
/*
 * Debug hook for transmit checksum offload: when finalize-forced mode
 * (HWCKSUM_DBG_FINALIZE_FORCED) is enabled, compute in software any
 * checksums that were deferred to hardware, and count the header/data
 * checksums finalized here.  TSO packets are left untouched.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		return;
	}
}
11859 
/*
 * Debug hook for receive checksum offload, driven by hwcksum_dbg_mode:
 *
 *  - HWCKSUM_DBG_PARTIAL_FORCED: synthesize a partial checksum
 *    starting at a forced offset, to exercise the stack's partial-
 *    checksum path even without hardware support.
 *
 *  - Verification of driver-provided partial checksums (counting
 *    mismatches from buggy hardware/drivers), and optionally, with
 *    HWCKSUM_DBG_PARTIAL_RXOFF_ADJ, adjustment of the checksum to a
 *    different start offset to emulate various hardware behaviors.
 *
 * Only IPv4/IPv6 packets with a sane frame header are considered.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity-check the frame header pointer against the mbuf extent */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11984 
/*
 * sysctl handler: get/set hwcksum_dbg_mode.  Writes require the
 * hwcksum_dbg master switch to be enabled, and only bits within
 * HWCKSUM_DBG_MASK may be set.
 */
static int
sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	u_int32_t i;
	int err;

	i = hwcksum_dbg_mode;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* read-only access, or error: nothing further to do */
		return err;
	}

	if (hwcksum_dbg == 0) {
		return ENODEV;
	}

	if ((i & ~HWCKSUM_DBG_MASK) != 0) {
		return EINVAL;
	}

	hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);

	return err;
}
12011 
/*
 * sysctl handler: get/set the forced partial-checksum start offset.
 * Writes are only accepted while HWCKSUM_DBG_PARTIAL_FORCED mode is
 * enabled.
 */
static int
sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	u_int32_t i;
	int err;

	i = hwcksum_dbg_partial_rxoff_forced;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* read-only access, or error: nothing further to do */
		return err;
	}

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
		return ENODEV;
	}

	hwcksum_dbg_partial_rxoff_forced = i;

	return err;
}
12034 
/*
 * sysctl handler: get/set the adjusted partial-checksum start offset.
 * Writes are only accepted while HWCKSUM_DBG_PARTIAL_RXOFF_ADJ mode
 * is enabled.
 */
static int
sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	u_int32_t i;
	int err;

	i = hwcksum_dbg_partial_rxoff_adj;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* read-only access, or error: nothing further to do */
		return err;
	}

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
		return ENODEV;
	}

	hwcksum_dbg_partial_rxoff_adj = i;

	return err;
}
12057 
12058 static int
12059 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12060 {
12061 #pragma unused(oidp, arg1, arg2)
12062 	int err;
12063 
12064 	if (req->oldptr == USER_ADDR_NULL) {
12065 	}
12066 	if (req->newptr != USER_ADDR_NULL) {
12067 		return EPERM;
12068 	}
12069 	err = SYSCTL_OUT(req, &tx_chain_len_stats,
12070 	    sizeof(struct chain_len_stats));
12071 
12072 	return err;
12073 }
12074 
12075 
12076 #if DEBUG || DEVELOPMENT
12077 /* Blob for sum16 verification */
static uint8_t sumdata[] = {
	/*
	 * Opaque test input for the checksum self-tests; the first two
	 * bytes (0x1f 0x8b) suggest gzip-compressed data, but only the
	 * raw byte values matter here.
	 */
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
12113 
12114 /* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t       init;   /* TRUE once sumr has been computed */
	uint16_t        len;    /* span length in bytes, from offset 0 */
	uint16_t        sumr;   /* reference (filled in at run time) */
	uint16_t        sumrp;  /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX      ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12138 
12139 static void
dlil_verify_sum16(void)12140 dlil_verify_sum16(void)
12141 {
12142 	struct mbuf *m;
12143 	uint8_t *buf;
12144 	int n;
12145 
12146 	/* Make sure test data plus extra room for alignment fits in cluster */
12147 	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);
12148 
12149 	kprintf("DLIL: running SUM16 self-tests ... ");
12150 
12151 	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
12152 	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));
12153 
12154 	buf = mtod(m, uint8_t *);               /* base address */
12155 
12156 	for (n = 0; n < SUMTBL_MAX; n++) {
12157 		uint16_t len = sumtbl[n].len;
12158 		int i;
12159 
12160 		/* Verify for all possible alignments */
12161 		for (i = 0; i < (int)sizeof(uint64_t); i++) {
12162 			uint16_t sum, sumr;
12163 			uint8_t *c;
12164 
12165 			/* Copy over test data to mbuf */
12166 			VERIFY(len <= sizeof(sumdata));
12167 			c = buf + i;
12168 			bcopy(sumdata, c, len);
12169 
12170 			/* Zero-offset test (align by data pointer) */
12171 			m->m_data = (caddr_t)c;
12172 			m->m_len = len;
12173 			sum = m_sum16(m, 0, len);
12174 
12175 			if (!sumtbl[n].init) {
12176 				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
12177 				sumtbl[n].sumr = sumr;
12178 				sumtbl[n].init = TRUE;
12179 			} else {
12180 				sumr = sumtbl[n].sumr;
12181 			}
12182 
12183 			/* Something is horribly broken; stop now */
12184 			if (sumr != sumtbl[n].sumrp) {
12185 				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
12186 				    "for len=%d align=%d sum=0x%04x "
12187 				    "[expected=0x%04x]\n", __func__,
12188 				    len, i, sum, sumr);
12189 				/* NOTREACHED */
12190 			} else if (sum != sumr) {
12191 				panic_plain("\n%s: broken m_sum16() for len=%d "
12192 				    "align=%d sum=0x%04x [expected=0x%04x]\n",
12193 				    __func__, len, i, sum, sumr);
12194 				/* NOTREACHED */
12195 			}
12196 
12197 			/* Alignment test by offset (fixed data pointer) */
12198 			m->m_data = (caddr_t)buf;
12199 			m->m_len = i + len;
12200 			sum = m_sum16(m, i, len);
12201 
12202 			/* Something is horribly broken; stop now */
12203 			if (sum != sumr) {
12204 				panic_plain("\n%s: broken m_sum16() for len=%d "
12205 				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
12206 				    __func__, len, i, sum, sumr);
12207 				/* NOTREACHED */
12208 			}
12209 #if INET
12210 			/* Simple sum16 contiguous buffer test by aligment */
12211 			sum = b_sum16(c, len);
12212 
12213 			/* Something is horribly broken; stop now */
12214 			if (sum != sumr) {
12215 				panic_plain("\n%s: broken b_sum16() for len=%d "
12216 				    "align=%d sum=0x%04x [expected=0x%04x]\n",
12217 				    __func__, len, i, sum, sumr);
12218 				/* NOTREACHED */
12219 			}
12220 #endif /* INET */
12221 		}
12222 	}
12223 	m_freem(m);
12224 
12225 	kprintf("PASSED\n");
12226 }
12227 #endif /* DEBUG || DEVELOPMENT */
12228 
12229 #define CASE_STRINGIFY(x) case x: return #x
12230 
12231 __private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)12232 dlil_kev_dl_code_str(u_int32_t event_code)
12233 {
12234 	switch (event_code) {
12235 		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
12236 		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
12237 		CASE_STRINGIFY(KEV_DL_SIFMTU);
12238 		CASE_STRINGIFY(KEV_DL_SIFPHYS);
12239 		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
12240 		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
12241 		CASE_STRINGIFY(KEV_DL_ADDMULTI);
12242 		CASE_STRINGIFY(KEV_DL_DELMULTI);
12243 		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
12244 		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
12245 		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
12246 		CASE_STRINGIFY(KEV_DL_LINK_OFF);
12247 		CASE_STRINGIFY(KEV_DL_LINK_ON);
12248 		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
12249 		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
12250 		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
12251 		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
12252 		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
12253 		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
12254 		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
12255 		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
12256 		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
12257 		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
12258 		CASE_STRINGIFY(KEV_DL_ISSUES);
12259 		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
12260 	default:
12261 		break;
12262 	}
12263 	return "";
12264 }
12265 
12266 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12267 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12268 {
12269 #pragma unused(arg1)
12270 	struct ifnet *ifp = arg0;
12271 
12272 	if (ifnet_is_attached(ifp, 1)) {
12273 		nstat_ifnet_threshold_reached(ifp->if_index);
12274 		ifnet_decr_iorefcnt(ifp);
12275 	}
12276 }
12277 
/*
 * Schedule a NetworkStatistics notification once the interface's
 * cumulative (in + out) byte count has advanced past if_data_threshold
 * since the last notification, rate-limited by threshold_interval.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;   /* byte count at last notify */

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 * The CAS on if_dt_bytes lets only one racing thread proceed;
	 * thread_call_isactive() avoids re-arming while a previous
	 * notification is still pending.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Defer to the end of the current rate-limit period */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No rate limit configured; notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12307 
#if (DEVELOPMENT || DEBUG)
/*
 * The sysctl variable name contains the input parameters of
 * ifnet_get_keepalive_offload_frames()
 *  ifp (interface index): name[0]
 *  frames_array_count:    name[1]
 *  frame_data_offset:     name[2]
 * The return length gives used_frames_count
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root can get look at other people TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		/* read-only sysctl: writes are not allowed */
		error = EPERM;
		goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/*
	 * Make sure the passed buffer is large enough.  Compare with a
	 * division instead of a multiplication so that an attacker-
	 * controlled frames_array_count cannot wrap size_t (and thereby
	 * defeat the bounds check) on ILP32 kernels; the two forms are
	 * mathematically equivalent when no overflow occurs.  This also
	 * guarantees the kalloc_data() size below cannot overflow.
	 */
	if (frames_array_count >
	    req->oldlen / sizeof(struct ifnet_keepalive_offload_frame)) {
		error = ENOMEM;
		goto done;
	}

	/* Resolve the interface index under the ifnet head lock */
	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	ifnet_head_done();

	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* Copy out only the frames actually filled in */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
#endif /* DEVELOPMENT || DEBUG */
12409 
/*
 * Forward per-flow interface statistics to the TCP layer for
 * accounting; thin wrapper around tcp_update_stats_per_flow().
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12416 
/*
 * Atomically OR set_flags into *flags_p.
 * Returns the value of the flags word immediately before the update.
 */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12422 
/* Atomically clear the bits in clear_flags from *flags_p. */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12428 
/*
 * Atomically set extended flags on the interface.
 * Returns the previous if_eflags value.
 */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12434 
/* Atomically clear extended flags on the interface. */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12440 
/*
 * Atomically set extra flags on the interface.
 * Returns the previous if_xflags value.
 */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12446 
/* Atomically clear extra flags on the interface. */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12452 
/*
 * Bump the interface's traffic-rule generation id so that readers
 * using ifnet_sync_traffic_rule_genid() observe the change.
 */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	atomic_add_32(&ifp->if_traffic_rule_genid, 1);
}
12458 
12459 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12460 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12461 {
12462 	if (*genid != ifp->if_traffic_rule_genid) {
12463 		*genid = ifp->if_traffic_rule_genid;
12464 		return TRUE;
12465 	}
12466 	return FALSE;
12467 }
/*
 * Publish a new traffic-rule count for the interface and bump the
 * generation id so cached readers resynchronize.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	atomic_set_32(&ifp->if_traffic_rule_count, count);
	ifnet_update_traffic_rule_genid(ifp);
}
12474 
12475 static void
log_hexdump(void * data,size_t len)12476 log_hexdump(void *data, size_t len)
12477 {
12478 	size_t i, j, k;
12479 	unsigned char *ptr = (unsigned char *)data;
12480 #define MAX_DUMP_BUF 32
12481 	unsigned char buf[3 * MAX_DUMP_BUF + 1];
12482 
12483 	for (i = 0; i < len; i += MAX_DUMP_BUF) {
12484 		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12485 			unsigned char msnbl = ptr[j] >> 4;
12486 			unsigned char lsnbl = ptr[j] & 0x0f;
12487 
12488 			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12489 			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12490 
12491 			if ((j % 2) == 1) {
12492 				buf[k++] = ' ';
12493 			}
12494 			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12495 				buf[k++] = ' ';
12496 			}
12497 		}
12498 		buf[k] = 0;
12499 		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12500 	}
12501 }
12502 
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Returns true when no non-OS interface filters are attached:
 * checked globally (via net_api_stats) when ifp is NULL, otherwise
 * for the given interface only.
 */
static bool
net_check_compatible_if_filter(struct ifnet *ifp)
{
	if (ifp != NULL) {
		return ifp->if_flt_non_os_count == 0;
	}
	return net_api_stats.nas_iflt_attach_count <=
	       net_api_stats.nas_iflt_attach_os_count;
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
12519 
/*
 * Advance the output cursor after a scnprintf() that wrote "k" bytes;
 * jump to the enclosing function's "done" label once the buffer is
 * exhausted.  Relies on locals c (cursor), k (bytes just written) and
 * clen (space remaining) being in scope.
 */
#define DUMP_BUF_CHK() {        \
	clen -= k;              \
	if (clen < 1)           \
	        goto done;      \
	c += k;                 \
}

int dlil_dump_top_if_qlen(char *, int);
/*
 * Write a short report into str (at most str_len bytes) naming the
 * interface with the deepest send queue (ifcq_len) and the one with
 * the deepest input queue.  Returns the number of bytes written.
 * NOTE(review): ifindex2ifnet is scanned here without the ifnet head
 * lock -- presumably acceptable for a debug dump; confirm.
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;
	struct ifnet *top_ifcq_ifp = NULL;      /* deepest send queue */
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;       /* deepest input queue */
	uint32_t top_inq_len = 0;

	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12568 
#if DEVELOPMENT || DEBUG
/*
 * Sysctl handler that installs a new flow key into flow_key_trace.
 * Accepts either a fully-specified UDP IPv4/IPv6 flow (non-zero ports
 * and non-wildcard addresses) or an all-zero key; anything else is
 * rejected with EINVAL.
 */
__private_extern__ int
packet_dump_trace_update(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct flow_key key = {};
	int err;

	/* Must be written with at least a full flow_key */
	if (req->newptr == USER_ADDR_NULL ||
	    req->newlen < sizeof(struct flow_key)) {
		return EINVAL;
	}
	err = SYSCTL_IN(req, &key, sizeof(struct flow_key));
	if (err != 0) {
		return err;
	}

	switch (key.fk_ipver) {
	case IPVERSION:
		/* IPv4: UDP only, with both ports and addresses specified */
		if (key.fk_proto == IPPROTO_UDP &&
		    key.fk_sport != 0 && key.fk_dport != 0 &&
		    key.fk_src4.s_addr != INADDR_ANY &&
		    key.fk_dst4.s_addr != INADDR_ANY) {
			break;
		}
		return EINVAL;
	case IPV6_VERSION:
		/* IPv6: UDP only, with both ports and addresses specified */
		if (key.fk_proto == IPPROTO_UDP &&
		    key.fk_sport != 0 && key.fk_dport != 0 &&
		    !IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) &&
		    !IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
			break;
		}
		return EINVAL;
	case 0:
		/* all-zero key: every field must be zero/unspecified */
		if (key.fk_proto == 0 &&
		    key.fk_sport == 0 && key.fk_dport == 0 &&
		    IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) &&
		    IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
			break;
		}
		return EINVAL;
	default:
		return EINVAL;
	}

	memcpy(&flow_key_trace, &key, sizeof(struct flow_key));
	return 0;
}
#endif /* DEVELOPMENT || DEBUG */
12632