1 /*
2 * Copyright (c) 1999-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections. This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/if_arp.h>
54 #include <net/iptap.h>
55 #include <net/pktap.h>
56 #include <net/nwk_wq.h>
57 #include <sys/kern_event.h>
58 #include <sys/kdebug.h>
59 #include <sys/mcache.h>
60 #include <sys/syslog.h>
61 #include <sys/protosw.h>
62 #include <sys/priv.h>
63
64 #include <kern/assert.h>
65 #include <kern/task.h>
66 #include <kern/thread.h>
67 #include <kern/sched_prim.h>
68 #include <kern/locks.h>
69 #include <kern/zalloc.h>
70
71 #include <net/kpi_protocol.h>
72 #include <net/if_types.h>
73 #include <net/if_ipsec.h>
74 #include <net/if_llreach.h>
75 #include <net/if_utun.h>
76 #include <net/kpi_interfacefilter.h>
77 #include <net/classq/classq.h>
78 #include <net/classq/classq_sfb.h>
79 #include <net/flowhash.h>
80 #include <net/ntstat.h>
81 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
82 #include <skywalk/lib/net_filter_event.h>
83 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143
144 #include <os/log.h>
145
146 #define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0)
147 #define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2)
148 #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
149 #define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
150 #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
151
152 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
153 #define MAX_LINKADDR 4 /* LONGWORDS */
154
155 #if 1
156 #define DLIL_PRINTF printf
157 #else
158 #define DLIL_PRINTF kprintf
159 #endif
160
161 #define IF_DATA_REQUIRE_ALIGNED_64(f) \
162 _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
163
164 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \
165 _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
166
/*
 * Protocol KPI versions a protocol may attach with; the value stored in
 * if_proto.proto_kpi selects which arm of the if_proto.kpi union is valid.
 */
enum {
	kProtoKPI_v1 = 1,	/* callbacks in if_proto.kpi.v1 (proto_media_input) */
	kProtoKPI_v2 = 2	/* callbacks in if_proto.kpi.v2 (proto_media_input_v2) */
};
171
172 uint64_t if_creation_generation_count = 0;
173
174 /*
175 * List of if_proto structures in if_proto_hash[] is protected by
176 * the ifnet lock. The rest of the fields are initialized at protocol
177 * attach time and never change, thus no lock required as long as
178 * a reference to it is valid, via if_proto_ref().
179 */
struct if_proto {
	SLIST_ENTRY(if_proto) next_hash;	/* if_proto_hash[] chain (ifnet lock) */
	u_int32_t refcount;			/* held refs; see if_proto_ref()/if_proto_free() */
	u_int32_t detached;			/* nonzero once detached from the interface */
	struct ifnet *ifp;			/* interface this protocol is attached to */
	protocol_family_t protocol_family;	/* attached protocol family */
	int proto_kpi;				/* kProtoKPI_v1 or kProtoKPI_v2; selects kpi arm */
	/*
	 * Protocol callbacks; only the arm matching proto_kpi is valid.
	 * The two versions differ only in the input callback signature.
	 */
	union {
		struct {
			proto_media_input input;		/* inbound packet handler (v1) */
			proto_media_preout pre_output;		/* pre-output hook */
			proto_media_event event;		/* interface event notification */
			proto_media_ioctl ioctl;		/* protocol ioctl handler */
			proto_media_detached detached;		/* detach completion callback */
			proto_media_resolve_multi resolve_multi; /* multicast address resolution */
			proto_media_send_arp send_arp;		/* ARP transmit hook */
		} v1;
		struct {
			proto_media_input_v2 input;		/* inbound packet handler (v2) */
			proto_media_preout pre_output;		/* pre-output hook */
			proto_media_event event;		/* interface event notification */
			proto_media_ioctl ioctl;		/* protocol ioctl handler */
			proto_media_detached detached;		/* detach completion callback */
			proto_media_resolve_multi resolve_multi; /* multicast address resolution */
			proto_media_send_arp send_arp;		/* ARP transmit hook */
		} v2;
	} kpi;
};
208
209 SLIST_HEAD(proto_hash_entry, if_proto);
210
211 #define DLIL_SDLDATALEN \
212 (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
213
/*
 * DLIL's private wrapper around struct ifnet.  The public ifnet must be
 * the first member so that DLIL_TO_IFP()/IFP_TO_DLIL() can convert
 * between the two by simple casting/address-of.
 */
struct dlil_ifnet {
	struct ifnet dl_if; /* public ifnet; MUST be first (see IFP_TO_DLIL) */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet_head linkage */
	u_int32_t dl_if_flags; /* DLIF_* flags (below) */
	u_int32_t dl_if_refcnt; /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void *dl_if_uniqueid; /* unique interface id */
	size_t dl_if_uniqueid_len; /* length of the unique id */
	char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */
	char dl_if_xnamestorage[IFXNAMSIZ]; /* external name storage */
	struct {
		struct ifaddr ifa; /* lladdr ifa */
		u_int8_t asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr; /* link-level address; see dlil_alloc_lladdr() */
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set; /* nonzero once the above is valid */
	u_int8_t dl_if_unused; /* padding */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t dl_if_attach; /* attach PC stacktrace */
	ctrace_t dl_if_detach; /* detach PC stacktrace */
};
241
242 /* Values for dl_if_flags (private to DLIL) */
243 #define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */
244 #define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */
245 #define DLIF_DEBUG 0x4 /* has debugging info */
246
247 #define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */
248
249 /* For gdb */
250 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
251
/*
 * Debug variant of dlil_ifnet (used when DLIF_DEBUG is set); embeds the
 * regular dlil_ifnet and records reference hold/release call traces.
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet dldbg_dlif; /* embedded dlil_ifnet; MUST be first */
	u_int16_t dldbg_if_refhold_cnt; /* # ifnet references */
	u_int16_t dldbg_if_refrele_cnt; /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers;
	 * IF_REF_TRACE_HIST_SIZE entries each, oldest overwritten first.
	 */
	ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
262
263 #define DLIL_TO_IFP(s) (&s->dl_if)
264 #define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s)
265
/*
 * Interface filter attached to an ifnet; see the kpi_interfacefilter
 * callbacks (iff_*).  Filters are chained per interface via filt_next.
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter) filt_next;	/* per-ifnet filter chain linkage */
	u_int32_t filt_skip;			/* nonzero: skip this filter during processing */
	u_int32_t filt_flags;			/* filter flags */
	ifnet_t filt_ifp;			/* interface the filter is attached to */
	const char *filt_name;			/* filter name (for diagnostics) */
	void *filt_cookie;			/* opaque client context, passed to callbacks */
	protocol_family_t filt_protocol;	/* protocol to filter (0 == all) -- TODO confirm 0 semantics */
	iff_input_func filt_input;		/* inbound packet callback */
	iff_output_func filt_output;		/* outbound packet callback */
	iff_event_func filt_event;		/* interface event callback */
	iff_ioctl_func filt_ioctl;		/* ioctl callback */
	iff_detached_func filt_detached;	/* detach completion callback */
};
280
281 /* Mbuf queue used for freeing the excessive mbufs */
282 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
283
284 struct proto_input_entry;
285
286 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
287
288 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
289
290 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
291 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
292 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
293 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
294 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
295
296 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
297 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
298 &dlil_lck_attributes);
299 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
300 &dlil_lck_attributes);
301
302 #if DEBUG
303 static unsigned int ifnet_debug = 1; /* debugging (enabled) */
304 #else
305 static unsigned int ifnet_debug; /* debugging (disabled) */
306 #endif /* !DEBUG */
307 static unsigned int dlif_size; /* size of dlil_ifnet to allocate */
308 static unsigned int dlif_bufsize; /* size of dlif_size + headroom */
309 static struct zone *dlif_zone; /* zone for dlil_ifnet */
310 #define DLIF_ZONE_NAME "ifnet" /* zone name */
311
312 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
313
314 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
315
316 static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */
317 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
318 static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */
319 #define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */
320
321 static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */
322 static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */
323 static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */
324 #define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */
325
326 static u_int32_t net_rtref;
327
328 static struct dlil_main_threading_info dlil_main_input_thread_info;
329 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
330 (struct dlil_threading_info *)&dlil_main_input_thread_info;
331
332 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
333 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
334 static void dlil_if_trace(struct dlil_ifnet *, int);
335 static void if_proto_ref(struct if_proto *);
336 static void if_proto_free(struct if_proto *);
337 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
338 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
339 u_int32_t list_count);
340 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
341 static void if_flt_monitor_busy(struct ifnet *);
342 static void if_flt_monitor_unbusy(struct ifnet *);
343 static void if_flt_monitor_enter(struct ifnet *);
344 static void if_flt_monitor_leave(struct ifnet *);
345 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
346 char **, protocol_family_t);
347 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
348 protocol_family_t);
349 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
350 const struct sockaddr_dl *);
351 static int ifnet_lookup(struct ifnet *);
352 static void if_purgeaddrs(struct ifnet *);
353
354 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
355 struct mbuf *, char *);
356 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
357 struct mbuf *);
358 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
359 mbuf_t *, const struct sockaddr *, void *, char *, char *);
360 static void ifproto_media_event(struct ifnet *, protocol_family_t,
361 const struct kev_msg *);
362 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
363 unsigned long, void *);
364 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
365 struct sockaddr_dl *, size_t);
366 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
367 const struct sockaddr_dl *, const struct sockaddr *,
368 const struct sockaddr_dl *, const struct sockaddr *);
369
370 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
371 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
372 boolean_t poll, struct thread *tp);
373 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
374 struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
375 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
376 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
377 protocol_family_t *);
378 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
379 const struct ifnet_demux_desc *, u_int32_t);
380 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
381 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
382 #if !XNU_TARGET_OS_OSX
383 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
384 const struct sockaddr *, const char *, const char *,
385 u_int32_t *, u_int32_t *);
386 #else /* XNU_TARGET_OS_OSX */
387 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
388 const struct sockaddr *, const char *, const char *);
389 #endif /* XNU_TARGET_OS_OSX */
390 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
391 const struct sockaddr *, const char *, const char *,
392 u_int32_t *, u_int32_t *);
393 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
394 static void ifp_if_free(struct ifnet *);
395 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
396 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
397 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
398
399 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
400 dlil_freeq_t *, struct ifnet_stat_increment_param *);
401
402 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
403 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
404 boolean_t, struct thread *);
405 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
406 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
407 boolean_t, struct thread *);
408
409 static void dlil_main_input_thread_func(void *, wait_result_t);
410 static void dlil_main_input_thread_cont(void *, wait_result_t);
411
412 static void dlil_input_thread_func(void *, wait_result_t);
413 static void dlil_input_thread_cont(void *, wait_result_t);
414
415 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
416 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
417
418 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
419 thread_continue_t *);
420 static void dlil_terminate_input_thread(struct dlil_threading_info *);
421 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
422 struct dlil_threading_info *, struct ifnet *, boolean_t);
423 static boolean_t dlil_input_stats_sync(struct ifnet *,
424 struct dlil_threading_info *);
425 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
426 u_int32_t, ifnet_model_t, boolean_t);
427 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
428 const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
429 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
430 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
431 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
432 #if DEBUG || DEVELOPMENT
433 static void dlil_verify_sum16(void);
434 #endif /* DEBUG || DEVELOPMENT */
435 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
436 protocol_family_t);
437 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
438 protocol_family_t);
439
440 static void dlil_incr_pending_thread_count(void);
441 static void dlil_decr_pending_thread_count(void);
442
443 static void ifnet_detacher_thread_func(void *, wait_result_t);
444 static void ifnet_detacher_thread_cont(void *, wait_result_t);
445 static void ifnet_detach_final(struct ifnet *);
446 static void ifnet_detaching_enqueue(struct ifnet *);
447 static struct ifnet *ifnet_detaching_dequeue(void);
448
449 static void ifnet_start_thread_func(void *, wait_result_t);
450 static void ifnet_start_thread_cont(void *, wait_result_t);
451
452 static void ifnet_poll_thread_func(void *, wait_result_t);
453 static void ifnet_poll_thread_cont(void *, wait_result_t);
454
455 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
456 classq_pkt_t *, boolean_t, boolean_t *);
457
458 static void ifp_src_route_copyout(struct ifnet *, struct route *);
459 static void ifp_src_route_copyin(struct ifnet *, struct route *);
460 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
461 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
462
463 static errno_t if_mcasts_update_async(struct ifnet *);
464
465 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
466 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
467 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
468 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
469 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
470 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
471 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
472 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
473 static int sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS;
474 static int sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS;
475 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
476 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
477 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
478
479 struct chain_len_stats tx_chain_len_stats;
480 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
481
482 #if TEST_INPUT_THREAD_TERMINATION
483 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
484 #endif /* TEST_INPUT_THREAD_TERMINATION */
485
486 /* The following are protected by dlil_ifnet_lock */
487 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
488 static u_int32_t ifnet_detaching_cnt;
489 static boolean_t ifnet_detaching_embryonic;
490 static void *ifnet_delayed_run; /* wait channel for detaching thread */
491
492 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
493 &dlil_lck_attributes);
494
495 static uint32_t ifnet_flowhash_seed;
496
/*
 * Key material fed to ifnet_calc_flowhash(); a snapshot of identifying
 * interface attributes plus random salt (ifk_rand1/ifk_rand2).
 */
struct ifnet_flowhash_key {
	char ifk_name[IFNAMSIZ];	/* interface name */
	uint32_t ifk_unit;		/* interface unit number */
	uint32_t ifk_flags;		/* interface flags */
	uint32_t ifk_eflags;		/* extended flags */
	uint32_t ifk_capabilities;	/* capabilities supported */
	uint32_t ifk_capenable;		/* capabilities enabled */
	uint32_t ifk_output_sched_model; /* output scheduling model */
	uint32_t ifk_rand1;		/* random salt */
	uint32_t ifk_rand2;		/* random salt */
};
508
509 /* Flow control entry per interface */
/* Flow control entry per interface; node in ifnet_fc_tree (ifnet_fc_lock) */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;	/* red-black tree linkage */
	u_int32_t ifce_flowhash;		/* tree key; see ifnet_calc_flowhash() */
	struct ifnet *ifce_ifp;			/* interface this entry refers to */
};
515
516 static uint32_t ifnet_calc_flowhash(struct ifnet *);
517 static int ifce_cmp(const struct ifnet_fc_entry *,
518 const struct ifnet_fc_entry *);
519 static int ifnet_fc_add(struct ifnet *);
520 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
521 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
522
523 /* protected by ifnet_fc_lock */
524 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
525 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
526 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
527
528 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
529
530 extern void bpfdetach(struct ifnet *);
531 extern void proto_input_run(void);
532
533 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
534 u_int32_t flags);
535 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
536 u_int32_t flags);
537
538 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
539
540 #if CONFIG_MACF
541 #if !XNU_TARGET_OS_OSX
542 int dlil_lladdr_ckreq = 1;
543 #else /* XNU_TARGET_OS_OSX */
544 int dlil_lladdr_ckreq = 0;
545 #endif /* XNU_TARGET_OS_OSX */
546 #endif /* CONFIG_MACF */
547
548 #if DEBUG
549 int dlil_verbose = 1;
550 #else
551 int dlil_verbose = 0;
552 #endif /* DEBUG */
553 #if IFNET_INPUT_SANITY_CHK
554 /* sanity checking of input packet lists received */
555 static u_int32_t dlil_input_sanity_check = 0;
556 #endif /* IFNET_INPUT_SANITY_CHK */
557 /* rate limit debug messages */
558 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
559
560 SYSCTL_DECL(_net_link_generic_system);
561
562 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
563 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
564
565 #define IF_SNDQ_MINLEN 32
566 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
567 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
568 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
569 sysctl_sndq_maxlen, "I", "Default transmit queue max length");
570
571 #define IF_RCVQ_MINLEN 32
572 #define IF_RCVQ_MAXLEN 256
573 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
574 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
575 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
576 sysctl_rcvq_maxlen, "I", "Default receive queue max length");
577
578 /*
579 * Protect against possible memory starvation that may happen
580 * when the driver is pushing data faster than the AP can process.
581 *
582 * If at any point during DLIL input phase any of the input queues
583 * exceeds the burst limit, DLIL will start to trim the queue,
584 * by returning mbufs in the input queue to the cache from which
585 * the mbufs were originally allocated, starting from the oldest
586 * mbuf and continuing until the new limit (see below) is reached.
587 *
 * In order to avoid a lock-step equilibrium, the trimming
 * will continue PAST the burst limit, until the corresponding
590 * input queue is reduced to `if_rcvq_trim_pct' %.
591 *
592 * For example, if the input queue limit is 1024 packets,
593 * and the trim percentage (`if_rcvq_trim_pct') is 80 %,
594 * the trimming will continue until the queue contains 819 packets
595 * (1024 * 80 / 100 == 819).
596 *
597 * Setting the burst limit too low can hurt the throughput,
598 * while setting the burst limit too high can defeat the purpose.
599 */
600 #define IF_RCVQ_BURST_LIMIT_MIN 1024
601 #define IF_RCVQ_BURST_LIMIT_DEFAULT 8192
602 #define IF_RCVQ_BURST_LIMIT_MAX 32768
603 uint32_t if_rcvq_burst_limit = IF_RCVQ_BURST_LIMIT_DEFAULT;
604 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_burst_limit,
605 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_burst_limit, IF_RCVQ_BURST_LIMIT_DEFAULT,
606 sysctl_rcvq_burst_limit, "I", "Upper memory limit for inbound data");
607
608 #define IF_RCVQ_TRIM_PCT_MIN 20
609 #define IF_RCVQ_TRIM_PCT_DEFAULT 80
610 #define IF_RCVQ_TRIM_PCT_MAX 100
611 uint32_t if_rcvq_trim_pct = IF_RCVQ_TRIM_PCT_DEFAULT;
612 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_trim_pct,
613 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_trim_pct, IF_RCVQ_TRIM_PCT_DEFAULT,
614 sysctl_rcvq_trim_pct, "I",
615 "Percentage (0 - 100) of the queue limit to keep after detecting an overflow burst");
616
617 #define IF_RXPOLL_DECAY 2 /* ilog2 of EWMA decay rate (4) */
618 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
619 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
620 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
621 "ilog2 of EWMA decay rate of avg inbound packets");
622
623 #define IF_RXPOLL_MODE_HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10 ms */
624 #define IF_RXPOLL_MODE_HOLDTIME (1000ULL * 1000 * 1000) /* 1 sec */
625 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
626 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
627 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
628 IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
629 "Q", "input poll mode freeze time");
630
631 #define IF_RXPOLL_SAMPLETIME_MIN (1ULL * 1000 * 1000) /* 1 ms */
632 #define IF_RXPOLL_SAMPLETIME (10ULL * 1000 * 1000) /* 10 ms */
633 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
634 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
635 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
636 IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
637 "Q", "input poll sampling time");
638
639 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
640 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
641 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
642 IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
643 "Q", "input poll interval (time)");
644
645 #define IF_RXPOLL_INTERVAL_PKTS 0 /* 0 (disabled) */
646 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
647 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
648 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
649 IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
650
651 #define IF_RXPOLL_WLOWAT 10
652 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
653 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
654 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
655 IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
656 "I", "input poll wakeup low watermark");
657
658 #define IF_RXPOLL_WHIWAT 100
659 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
660 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
661 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
662 IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
663 "I", "input poll wakeup high watermark");
664
665 static u_int32_t if_rxpoll_max = 0; /* 0 (automatic) */
666 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
667 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
668 "max packets per poll call");
669
670 u_int32_t if_rxpoll = 1;
671 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
672 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
673 sysctl_rxpoll, "I", "enable opportunistic input polling");
674
675 #if TEST_INPUT_THREAD_TERMINATION
676 static u_int32_t if_input_thread_termination_spin = 0;
677 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
678 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
679 &if_input_thread_termination_spin, 0,
680 sysctl_input_thread_termination_spin,
681 "I", "input thread termination spin limit");
682 #endif /* TEST_INPUT_THREAD_TERMINATION */
683
684 static u_int32_t cur_dlil_input_threads = 0;
685 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
686 CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
687 "Current number of DLIL input threads");
688
689 #if IFNET_INPUT_SANITY_CHK
690 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
691 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
692 "Turn on sanity checking in DLIL input");
693 #endif /* IFNET_INPUT_SANITY_CHK */
694
695 static u_int32_t if_flowadv = 1;
696 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
697 CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
698 "enable flow-advisory mechanism");
699
700 static u_int32_t if_delaybased_queue = 1;
701 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
702 CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
703 "enable delay based dynamic queue sizing");
704
705 static uint64_t hwcksum_in_invalidated = 0;
706 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
707 hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
708 &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
709
710 uint32_t hwcksum_dbg = 0;
711 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
712 CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
713 "enable hardware cksum debugging");
714
715 u_int32_t ifnet_start_delayed = 0;
716 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
717 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
718 "number of times start was delayed");
719
720 u_int32_t ifnet_delay_start_disabled = 0;
721 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
722 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
723 "number of times start was delayed");
724
/*
 * Atomically increment the ifnet_delay_start_disabled counter
 * (exported through the start_delay_disabled sysctl above).
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
730
/*
 * Hardware checksum offload debugging: flags selecting which debug
 * behaviors are active (hwcksum_dbg_mode), counters tracking each
 * path, global TX/RX offload switches, TX chain length statistics,
 * and data threshold notification knobs.  All are exported as sysctls
 * under net.link.generic.system.
 */
#define HWCKSUM_DBG_PARTIAL_FORCED      0x1     /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ   0x2     /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED     0x10    /* forced finalize */
#define HWCKSUM_DBG_MASK \
	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ | \
	HWCKSUM_DBG_FINALIZE_FORCED)

/* active debug mode bits; changed through the sysctl_hwcksum_dbg_mode handler */
static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

/* global switches for transmit/receive h/w checksum offload */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

static uint32_t threshold_notify = 1;           /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2;         /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

/* exported read-only as net.api_stats */
struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
848
849 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)850 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
851 {
852 /*
853 * update filter count and route_generation ID to let TCP
854 * know it should reevalute doing TSO or not
855 */
856 if (filter_enable) {
857 OSAddAtomic(1, &ifp->if_flt_no_tso_count);
858 } else {
859 VERIFY(ifp->if_flt_no_tso_count != 0);
860 OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
861 }
862 routegenid_update();
863 }
864
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* snapshots of individual IF_ATTACH_NX_* default bits as 0/1 values */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
883
#if (DEVELOPMENT || DEBUG)
/*
 * Sysctl handler for net.link.generic.system.if_attach_nx: lets the
 * automatic nexus attach flags be changed at runtime, except for the
 * flowswitch transport netagent bit, which may not be toggled here.
 */
static int
if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
	    &new_value, &changed);
	if (error) {
		return error;
	}
	if (changed) {
		/* refuse any change to the transport netagent bit */
		if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
		    (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
			return ENOTSUP;
		}
		if_attach_nx = new_value;
	}
	return 0;
}

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
911
912 static int
913 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
914 {
915 #pragma unused(oidp, arg1, arg2)
916 unsigned int new_value;
917 int changed;
918 int error;
919
920 error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
921 sizeof(if_enable_fsw_transport_netagent),
922 &new_value, &changed);
923 if (error == 0 && changed != 0) {
924 if (new_value != 0 && new_value != 1) {
925 /* only allow 0 or 1 */
926 error = EINVAL;
927 } else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
928 /* netagent can be enabled/disabled */
929 if_enable_fsw_transport_netagent = new_value;
930 if (new_value == 0) {
931 kern_nexus_deregister_netagents();
932 } else {
933 kern_nexus_register_netagents();
934 }
935 } else {
936 /* netagent can't be enabled */
937 error = ENOTSUP;
938 }
939 }
940 return error;
941 }
942
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

/* forward declaration; defined later in this file */
static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
951
952 boolean_t
ifnet_nx_noauto(ifnet_t ifp)953 ifnet_nx_noauto(ifnet_t ifp)
954 {
955 return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
956 }
957
958 boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)959 ifnet_nx_noauto_flowswitch(ifnet_t ifp)
960 {
961 return ifnet_is_low_latency(ifp);
962 }
963
964 boolean_t
ifnet_is_low_latency(ifnet_t ifp)965 ifnet_is_low_latency(ifnet_t ifp)
966 {
967 return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
968 }
969
/*
 * Decide whether the netif compat layer should be plumbed for `ifp'.
 * Gated globally by the IF_ATTACH_NX_NETIF_COMPAT flag; on non-macOS
 * targets the Wi-Fi "ap" interface follows if_netif_all instead.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point: the interface is literally named "ap" */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
994
995 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)996 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
997 {
998 if (if_is_fsw_transport_netagent_enabled()) {
999 /* check if netagent has been manually enabled for ipsec/utun */
1000 if (ifp->if_family == IFNET_FAMILY_IPSEC) {
1001 return ipsec_interface_needs_netagent(ifp);
1002 } else if (ifp->if_family == IFNET_FAMILY_UTUN) {
1003 return utun_interface_needs_netagent(ifp);
1004 }
1005
1006 /* check ifnet no auto nexus override */
1007 if (ifnet_nx_noauto(ifp)) {
1008 return FALSE;
1009 }
1010
1011 /* check global if_attach_nx configuration */
1012 switch (ifp->if_family) {
1013 case IFNET_FAMILY_CELLULAR:
1014 case IFNET_FAMILY_ETHERNET:
1015 if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
1016 return TRUE;
1017 }
1018 break;
1019 default:
1020 break;
1021 }
1022 }
1023 return FALSE;
1024 }
1025
1026 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)1027 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
1028 {
1029 #pragma unused(ifp)
1030 if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
1031 return TRUE;
1032 }
1033 return FALSE;
1034 }
1035
1036 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)1037 ifnet_needs_netif_netagent(ifnet_t ifp)
1038 {
1039 #pragma unused(ifp)
1040 return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1041 }
1042
/*
 * Tear down a single nexus provider instance, detaching its device
 * port first when one is present.  Returns TRUE if an instance existed
 * and teardown was attempted, FALSE if there was nothing to do.
 * Errors from the kern_nexus calls are logged (prefixed with
 * `func_str') but not propagated; teardown always proceeds.
 */
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t err;

	/* no instance: nothing to detach */
	if (instance == NULL || uuid_is_null(instance)) {
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}
1069
/*
 * Fully detach a nexus: first tear down the provider instance (and its
 * device port), then deregister the provider itself.  Returns TRUE if
 * either an instance or a provider was present.  Deregistration
 * failures are logged but not propagated.
 */
static boolean_t
dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
    uuid_t device)
{
	boolean_t detached = FALSE;
	nexus_controller_t controller = kern_nexus_shared_controller();
	int err;

	if (dlil_detach_nexus_instance(controller, func_str, instance,
	    device)) {
		detached = TRUE;
	}
	if (provider != NULL && !uuid_is_null(provider)) {
		detached = TRUE;
		err = kern_nexus_controller_deregister_provider(controller,
		    provider);
		if (err != 0) {
			DLIL_PRINTF("%s deregister_provider %d\n",
			    func_str, err);
		}
	}
	return detached;
}
1093
/*
 * Register a nexus provider (netif or flowswitch) named after the
 * interface, then allocate a provider instance for it.  On
 * instance-allocation failure the freshly registered provider is
 * deregistered again.  Returns 0 on success with *provider and
 * *instance filled in; otherwise the error from the failing step.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t dom_prov;
	errno_t err;
	nexus_name_t provider_name;
	const char *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	/* e.g. "com.apple.netif.en0" */
	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* undo the provider registration performed above */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
failed:
	/* success path also exits here with err == 0 */
	return err;
}
1143
/*
 * Create a netif nexus provider/instance for `ifp' and attach the
 * interface to it, filling in `netif_nx' on success.  Returns TRUE on
 * success; FALSE if a nexus is already attached or any step fails, in
 * which case all partially created state is rolled back.
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	/*
	 * NOTE(review): attr is not destroyed on this success path —
	 * confirm its ownership is consumed by provider registration.
	 */
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1197
1198 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1199 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1200 {
1201 if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1202 IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
1203 goto failed;
1204 }
1205 switch (ifp->if_type) {
1206 case IFT_CELLULAR:
1207 case IFT_ETHER:
1208 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1209 /* don't auto-attach */
1210 goto failed;
1211 }
1212 break;
1213 default:
1214 /* don't auto-attach */
1215 goto failed;
1216 }
1217 return dlil_attach_netif_nexus_common(ifp, netif_nx);
1218
1219 failed:
1220 return FALSE;
1221 }
1222
1223 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1224 dlil_is_native_netif_nexus(ifnet_t ifp)
1225 {
1226 return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1227 }
1228
/* Tear down the netif nexus recorded in `nexus_netif'. */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1236
1237 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1238 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1239 {
1240 struct ifreq ifr;
1241 int error;
1242
1243 bzero(&ifr, sizeof(ifr));
1244 error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1245 if (error == 0) {
1246 *ifdm_p = ifr.ifr_devmtu;
1247 }
1248 return error;
1249 }
1250
/*
 * On macOS targets, grow *large_buf_size for a Skywalk-native netif to
 * cover the driver's advertised TSO MTU (or, lacking TSO, at least the
 * software GSO MTU), capped at NX_FSW_MAX_LARGE_BUFSIZE.  No-op for
 * non-native netifs and on other targets.
 */
static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
{
#ifdef XNU_TARGET_OS_OSX
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;

	if (!dlil_is_native_netif_nexus(ifp)) {
		return;
	}
	/*
	 * Note that we are reading the real hwassist flags set by the driver
	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
	 * hasn't been called yet.
	 */
	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
		tso_v4_mtu = ifp->if_tso_v4_mtu;
	}
	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
		tso_v6_mtu = ifp->if_tso_v6_mtu;
	}
	/*
	 * If the hardware supports TSO, adjust the large buf size to match the
	 * supported TSO MTU size.
	 */
	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
	} else {
		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
	}
	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
#else
#pragma unused(ifp, large_buf_size)
#endif /* XNU_TARGET_OS_OSX */
}
1286
1287 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1288 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1289 bool *use_multi_buflet, uint32_t *large_buf_size)
1290 {
1291 struct kern_pbufpool_memory_info rx_pp_info;
1292 struct kern_pbufpool_memory_info tx_pp_info;
1293 uint32_t if_max_mtu = 0;
1294 uint32_t drv_buf_size;
1295 struct ifdevmtu ifdm;
1296 int err;
1297
1298 /*
1299 * To perform intra-stack RX aggregation flowswitch needs to use
1300 * multi-buflet packet.
1301 */
1302 *use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1303
1304 *large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1305 /*
1306 * IP over Thunderbolt interface can deliver the largest IP packet,
1307 * but the driver advertises the MAX MTU as only 9K.
1308 */
1309 if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1310 if_max_mtu = IP_MAXPACKET;
1311 goto skip_mtu_ioctl;
1312 }
1313
1314 /* determine max mtu */
1315 bzero(&ifdm, sizeof(ifdm));
1316 err = dlil_siocgifdevmtu(ifp, &ifdm);
1317 if (__improbable(err != 0)) {
1318 DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1319 __func__, if_name(ifp));
1320 /* use default flowswitch buffer size */
1321 if_max_mtu = NX_FSW_BUFSIZE;
1322 } else {
1323 DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1324 ifdm.ifdm_max, ifdm.ifdm_current);
1325 /* rdar://problem/44589731 */
1326 if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1327 }
1328
1329 skip_mtu_ioctl:
1330 if (if_max_mtu == 0) {
1331 DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1332 __func__, if_name(ifp));
1333 return EINVAL;
1334 }
1335 if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1336 DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1337 "max bufsize(%d)\n", __func__,
1338 if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1339 return EINVAL;
1340 }
1341
1342 /*
1343 * for skywalk native driver, consult the driver packet pool also.
1344 */
1345 if (dlil_is_native_netif_nexus(ifp)) {
1346 err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1347 &tx_pp_info);
1348 if (err != 0) {
1349 DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1350 __func__, if_name(ifp));
1351 return ENXIO;
1352 }
1353 drv_buf_size = tx_pp_info.kpm_bufsize *
1354 tx_pp_info.kpm_max_frags;
1355 if (if_max_mtu > drv_buf_size) {
1356 DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1357 "tx %d * %d) can't support max mtu(%d)\n", __func__,
1358 if_name(ifp), rx_pp_info.kpm_bufsize,
1359 rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1360 tx_pp_info.kpm_max_frags, if_max_mtu);
1361 return EINVAL;
1362 }
1363 } else {
1364 drv_buf_size = if_max_mtu;
1365 }
1366
1367 if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1368 _CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1369 *use_multi_buflet = true;
1370 /* default flowswitch buffer size */
1371 *buf_size = NX_FSW_BUFSIZE;
1372 *large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1373 } else {
1374 *buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1375 }
1376 _dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1377 ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1378 if (*buf_size >= *large_buf_size) {
1379 *large_buf_size = 0;
1380 }
1381 return 0;
1382 }
1383
/*
 * Create and attach a flowswitch nexus on top of the existing netif
 * for `ifp', sizing its buffer pool from the interface's MTU/TSO
 * parameters.  Fills in `nexus_fsw' and returns TRUE on success; on
 * failure all partially created state is rolled back, the reason is
 * logged, and FALSE is returned.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err = 0;
	uuid_t netif;
	uint32_t buf_size = 0;
	uint32_t large_buf_size = 0;
	bool multi_buflet;

	/* policy checks: opted out, low latency, or vmnet interfaces */
	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	/*
	 * NOTE(review): attr is not destroyed on this success path —
	 * confirm its ownership is consumed by provider registration.
	 */
	return TRUE;

failed:
	/* err == 0 here means a policy/eligibility refusal, not an error */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1482
/*
 * Auto-attach a flowswitch to `ifp' if eligible (modern TX model, no
 * flowswitch already attached).  The new nexus is committed to
 * if_nx_flowswitch under the ifnet lock only while the interface is
 * still fully attached; otherwise it is torn down again.  Returns TRUE
 * when the attach was committed.
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t attached;
	if_nexus_flowswitch nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
		    if_name(ifp));
		return FALSE;
	}

	/* non-null instance uuid means a flowswitch is already present */
	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}
1528
/* Tear down the flowswitch nexus recorded in `nexus_fsw'. */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1536
1537 __attribute__((noinline))
1538 static void
dlil_netif_detach_notify(ifnet_t ifp)1539 dlil_netif_detach_notify(ifnet_t ifp)
1540 {
1541 ifnet_detach_notify_cb_t notify = NULL;
1542 void *arg = NULL;
1543
1544 ifnet_get_detach_notify(ifp, ¬ify, &arg);
1545 if (notify == NULL) {
1546 DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1547 return;
1548 }
1549 (*notify)(arg);
1550 }
1551
/*
 * Quiesce all data movement on `ifp', then detach its flowswitch and
 * netif nexuses (flowswitch first) and clear their records, resuming
 * data movement afterwards.  The per-nexus uuids must be consistently
 * all-set or all-null (asserted below).
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block and drain data movement before tearing anything down */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		/* no device port implies no flowswitch state at all */
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		/* no attach handle implies no netif state at all */
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1583
1584 boolean_t
ifnet_add_netagent(ifnet_t ifp)1585 ifnet_add_netagent(ifnet_t ifp)
1586 {
1587 int error;
1588
1589 error = kern_nexus_interface_add_netagent(ifp);
1590 os_log(OS_LOG_DEFAULT,
1591 "kern_nexus_interface_add_netagent(%s) returned %d",
1592 ifp->if_xname, error);
1593 return error == 0;
1594 }
1595
1596 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1597 ifnet_remove_netagent(ifnet_t ifp)
1598 {
1599 int error;
1600
1601 error = kern_nexus_interface_remove_netagent(ifp);
1602 os_log(OS_LOG_DEFAULT,
1603 "kern_nexus_interface_remove_netagent(%s) returned %d",
1604 ifp->if_xname, error);
1605 return error == 0;
1606 }
1607
1608 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1609 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1610 {
1611 if (!IF_FULLY_ATTACHED(ifp)) {
1612 return FALSE;
1613 }
1614 return dlil_attach_flowswitch_nexus(ifp);
1615 }
1616
/*
 * Detach the interface's flowswitch nexus.  The ifnet record is
 * snapshotted and cleared under the ifnet lock; the actual teardown
 * then runs on the snapshot without the lock held.  Returns TRUE if
 * anything was detached.
 */
boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)
{
	if_nexus_flowswitch nexus_fsw;

	ifnet_lock_exclusive(ifp);
	nexus_fsw = ifp->if_nx_flowswitch;
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
	ifnet_lock_done(ifp);
	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
	    nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
}
1629
/*
 * Attach a netif nexus to a fully attached interface, recording it in
 * if_nx_netif under the ifnet lock.  Returns TRUE on success.
 */
boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)
{
	boolean_t nexus_attached;
	if_nexus_netif nexus_netif;

	if (!IF_FULLY_ATTACHED(ifp)) {
		return FALSE;
	}
	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
	if (nexus_attached) {
		ifnet_lock_exclusive(ifp);
		ifp->if_nx_netif = nexus_netif;
		ifnet_lock_done(ifp);
	}
	return nexus_attached;
}
1647
/*
 * Detach the interface's netif nexus.  The ifnet record is
 * snapshotted and cleared under the ifnet lock; teardown then runs on
 * the snapshot without the lock held.  Returns TRUE if anything was
 * detached.
 */
boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)
{
	if_nexus_netif nexus_netif;

	ifnet_lock_exclusive(ifp);
	nexus_netif = ifp->if_nx_netif;
	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
	ifnet_lock_done(ifp);

	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
	    nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
}
1661
1662 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1663 ifnet_attach_native_flowswitch(ifnet_t ifp)
1664 {
1665 if (!dlil_is_native_netif_nexus(ifp)) {
1666 /* not a native netif */
1667 return;
1668 }
1669 ifnet_attach_flowswitch_nexus(ifp);
1670 }
1671
/*
 * Install the flowswitch RX callback and its argument.  Waits until
 * every reference taken via ifnet_get_flowswitch_rx_callback() has
 * been released, so an in-use callback is never replaced mid-call.
 * Always returns 0.
 */
int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	while (ifp->if_fsw_rx_cb_ref > 0) {
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		/* woken by ifnet_release_flowswitch_rx_callback() */
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1687
/*
 * Look up the flowswitch RX callback and argument, taking a reference
 * that the caller must drop with
 * ifnet_release_flowswitch_rx_callback().  Returns ENOENT when no
 * callback is installed.
 */
int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * This is for avoiding the unnecessary lock acquire for interfaces
	 * not used by a redirect interface.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* re-check under the lock; it may have been cleared meanwhile */
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1709
/*
 * Drop a reference taken by ifnet_get_flowswitch_rx_callback(),
 * waking any setter blocked in ifnet_set_flowswitch_rx_callback()
 * when the last reference goes away.
 */
void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (--ifp->if_fsw_rx_cb_ref == 0) {
		wakeup(&ifp->if_fsw_rx_cb_ref);
	}
	lck_mtx_unlock(&ifp->if_delegate_lock);
}
1719
/*
 * Set the delegate parent of `difp', waiting until all references from
 * ifnet_get_delegate_parent() are released so the parent pointer is
 * never swapped while in use.  Always returns 0.
 */
int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	while (difp->if_delegate_parent_ref > 0) {
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		/* woken by ifnet_release_delegate_parent() */
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1734
/*
 * Get the delegate parent of `difp' with a reference that the caller
 * must drop via ifnet_release_delegate_parent().  Returns ENOENT when
 * no parent is set.
 */
int
ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (difp->if_delegate_parent == NULL) {
		lck_mtx_unlock(&difp->if_delegate_lock);
		return ENOENT;
	}
	*parentp = difp->if_delegate_parent;
	difp->if_delegate_parent_ref++;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1748
/*
 * Drop a delegate-parent reference, waking any setter blocked in
 * ifnet_set_delegate_parent() when the last reference goes away.
 */
void
ifnet_release_delegate_parent(ifnet_t difp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (--difp->if_delegate_parent_ref == 0) {
		wakeup(&difp->if_delegate_parent_ref);
	}
	lck_mtx_unlock(&difp->if_delegate_lock);
}
1758
/*
 * Record the detach notification callback and argument; the caller
 * must hold the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}
1767
/*
 * Read back the detach-notification callback and argument from `ifp'.
 * Caller must hold the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	*notifyp = ifp->if_detach_notify;
	*argp = ifp->if_detach_notify_arg;
}
1776
/*
 * Locking wrapper: take the ifnet lock exclusively and install the
 * detach-notification callback via the _locked variant.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_exclusive(ifp);
	ifnet_set_detach_notify_locked(ifp, notify, arg);
	ifnet_lock_done(ifp);
}
1785
/*
 * Locking wrapper: take the ifnet lock exclusively and read the
 * detach-notification callback via the _locked variant.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_exclusive(ifp);
	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
	ifnet_lock_done(ifp);
}
1794 #endif /* SKYWALK */
1795
/*
 * Sanity-check an inbound mbuf: it must carry a packet header and its
 * recorded receive interface must match `ifp' (any rcvif is tolerated
 * when ifp is the loopback interface).  Panics on violation.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	        /* NOTREACHED */                                        \
	}                                                               \
}
1804
/*
 * Integer exponentially-weighted moving average:
 *   old <- old + (new - old) / 2^decay
 * computed as (((old << decay) - old) + new) >> decay; a zero `old'
 * seeds the average directly with `new'.
 * Note: `decay' is expanded twice — avoid arguments with side effects.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
1813
#define MBPS (1ULL * 1000 * 1000)       /* megabits per second */
#define GBPS (MBPS * 1000)              /* gigabits per second */

/*
 * Per-downlink-speed packet/byte watermarks used by opportunistic
 * input polling.  Entries are ordered by increasing link speed; the
 * table ends with an all-zero sentinel entry.
 */
struct rxpoll_time_tbl {
	u_int64_t speed;                /* downlink speed */
	u_int32_t plowat;               /* packets low watermark */
	u_int32_t phiwat;               /* packets high watermark */
	u_int32_t blowat;               /* bytes low watermark */
	u_int32_t bhiwat;               /* bytes high watermark */
};

static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
1833
/* Serializes updates to, and sleeps on, dlil_pending_thread_cnt */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
/* Count of DLIL threads created but not yet fully started; waiters
 * sleep on this counter until it drains to zero. */
static uint32_t dlil_pending_thread_cnt = 0;
1837
/*
 * Account for one more DLIL thread pending startup.
 * Must be called without dlil_thread_sync_lock held.
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1846
/*
 * Account for one DLIL thread finishing startup; wake anyone waiting
 * for the pending count to reach zero.
 * Must be called without dlil_thread_sync_lock held.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1859
1860 int
proto_hash_value(u_int32_t protocol_family)1861 proto_hash_value(u_int32_t protocol_family)
1862 {
1863 /*
1864 * dlil_proto_unplumb_all() depends on the mapping between
1865 * the hash bucket index and the protocol family defined
1866 * here; future changes must be applied there as well.
1867 */
1868 switch (protocol_family) {
1869 case PF_INET:
1870 return 0;
1871 case PF_INET6:
1872 return 1;
1873 case PF_VLAN:
1874 return 2;
1875 case PF_UNSPEC:
1876 default:
1877 return 3;
1878 }
1879 }
1880
1881 /*
1882 * Caller must already be holding ifnet lock.
1883 */
1884 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1885 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1886 {
1887 struct if_proto *proto = NULL;
1888 u_int32_t i = proto_hash_value(protocol_family);
1889
1890 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1891
1892 if (ifp->if_proto_hash != NULL) {
1893 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1894 }
1895
1896 while (proto != NULL && proto->protocol_family != protocol_family) {
1897 proto = SLIST_NEXT(proto, next_hash);
1898 }
1899
1900 if (proto != NULL) {
1901 if_proto_ref(proto);
1902 }
1903
1904 return proto;
1905 }
1906
/* Take a reference on an attached protocol entry (relaxed atomic). */
static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}
1912
1913 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1914
/*
 * Drop a reference on `proto'.  On the final release: invoke the
 * protocol's detached callback (v1 or v2 KPI), purge routes for the
 * interface/protocol pair, post KEV_DL_PROTO_DETACHED carrying the
 * number of protocols still attached, mark the interface down if that
 * number reached zero, and free the structure.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		/* not the last reference; nothing more to do */
		return;
	}

	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1976
1977 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1978 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1979 {
1980 #if !MACH_ASSERT
1981 #pragma unused(ifp)
1982 #endif
1983 unsigned int type = 0;
1984 int ass = 1;
1985
1986 switch (what) {
1987 case IFNET_LCK_ASSERT_EXCLUSIVE:
1988 type = LCK_RW_ASSERT_EXCLUSIVE;
1989 break;
1990
1991 case IFNET_LCK_ASSERT_SHARED:
1992 type = LCK_RW_ASSERT_SHARED;
1993 break;
1994
1995 case IFNET_LCK_ASSERT_OWNED:
1996 type = LCK_RW_ASSERT_HELD;
1997 break;
1998
1999 case IFNET_LCK_ASSERT_NOTOWNED:
2000 /* nothing to do here for RW lock; bypass assert */
2001 ass = 0;
2002 break;
2003
2004 default:
2005 panic("bad ifnet assert type: %d", what);
2006 /* NOTREACHED */
2007 }
2008 if (ass) {
2009 LCK_RW_ASSERT(&ifp->if_lock, type);
2010 }
2011 }
2012
/* Acquire the per-interface RW lock in shared (read) mode. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Acquire the per-interface RW lock in exclusive (write) mode. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release the per-interface RW lock (either mode). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
2030
2031 #if INET
/* Acquire the interface's IPv4 data RW lock in shared mode. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Acquire the interface's IPv4 data RW lock in exclusive mode. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the interface's IPv4 data RW lock (either mode). */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
2049 #endif
2050
/* Acquire the interface's IPv6 data RW lock in shared mode. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Acquire the interface's IPv6 data RW lock in exclusive mode. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the interface's IPv6 data RW lock (either mode). */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
2068
/* Acquire the global interface-list RW lock in shared mode. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Acquire the global interface-list RW lock in exclusive mode. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface-list RW lock (either mode). */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert the global interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
2092
2093 /*
2094 * dlil_ifp_protolist
2095 * - get the list of protocols attached to the interface, or just the number
2096 * of attached protocols
2097 * - if the number returned is greater than 'list_count', truncation occurred
2098 *
2099 * Note:
2100 * - caller must already be holding ifnet lock.
2101 */
2102 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)2103 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
2104 u_int32_t list_count)
2105 {
2106 u_int32_t count = 0;
2107 int i;
2108
2109 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
2110
2111 if (ifp->if_proto_hash == NULL) {
2112 goto done;
2113 }
2114
2115 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
2116 struct if_proto *proto;
2117 SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
2118 if (list != NULL && count < list_count) {
2119 list[count] = proto->protocol_family;
2120 }
2121 count++;
2122 }
2123 }
2124 done:
2125 return count;
2126 }
2127
/*
 * Locking wrapper around dlil_ifp_protolist(): fill `protolist' with
 * up to `count' attached protocol families and return the total
 * number attached (may exceed `count' on truncation).
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
2136
/* Free a protocol-family list previously handed out to a caller. */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
2142
/*
 * Build and post a KEV_NETWORK_CLASS kernel event for `ifp'.
 *
 * event_data/event_data_len: optional caller-supplied payload that
 * must begin with a struct net_event_data; when NULL, a zeroed
 * net_event_data is used.  The interface name/family/unit are stamped
 * into the payload here, so callers need not fill them in.
 * suppress_generation: when TRUE, skip the interface-generation bump
 * that would otherwise trigger NECP client updates.
 *
 * Returns the result of dlil_event_internal().
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code      = KEV_VENDOR_APPLE;
	ev_msg.kev_class        = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass     = event_subclass;
	ev_msg.event_code       = event_code;

	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr    = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
2204
/*
 * Allocate the per-interface TCP/UDP/ECN statistics buffers for `ifp'.
 *
 * The tcpstat/udpstat objects are carved out of zone buffers with the
 * stats structure placed at a 64-bit aligned offset; the pointer-sized
 * word immediately preceding the aligned base stores the original zone
 * pointer so the buffer can be freed later.
 *
 * Returns 0 on success, EINVAL if ifp is NULL.
 *
 * NOTE(review): if if_tcp_stat or if_udp_stat is already non-NULL on
 * entry, `ret' stays EINVAL and the error path below frees the
 * existing buffers — presumably this is only ever called once per
 * interface at attach time; confirm before reusing elsewhere.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original zone pointer stored below base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2290
/*
 * Reset all opportunistic-polling state on `ifp' to its quiescent
 * defaults: clear the poll cycle, flags, request count, statistics,
 * and the mode/sample hold and last-update timers, and force the
 * input model back to IFNET_MODEL_INPUT_POLL_OFF.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2309
/*
 * Set up and start the input thread described by `inp'.
 *
 * ifp == NULL denotes the main (loopback) input thread, created once
 * at dlil_init() time.  Otherwise one of three per-interface
 * strategies is selected: legacy hybrid polling, asynchronous (a
 * dedicated worker thread), or synchronous (netif-backed; no thread
 * function at all).  When `thfunc' is non-NULL it receives the chosen
 * thread continuation (NULL for the synchronous strategy).
 *
 * Returns 0 on success, ENODEV for the threadless synchronous
 * strategy; panics if kernel_thread_start() itself fails.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no dedicated thread to start */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2451
#if TEST_INPUT_THREAD_TERMINATION
/*
 * sysctl handler for the test-only input-thread termination spin
 * count; rejects updates when receive polling is disabled.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* read-only access or error: nothing to update */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
#endif /* TEST_INPUT_THREAD_TERMINATION */
2475
/*
 * Tear down and zero a dlil_threading_info after its thread has
 * terminated: destroy the per-thread lock and group, reset all state
 * fields, and VERIFY that the packet queue is already empty and that
 * no affinity or auxiliary threads remain.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2501
/*
 * Terminate the calling per-interface input thread: drain and free
 * any queued packets, signal DLIL_INPUT_TERMINATE_COMPLETE to the
 * waiting thread, drop the reference from kernel_thread_start(), and
 * self-terminate.  Must run on the thread being terminated; never
 * returns.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2549
2550 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2551 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2552 {
2553 thread_affinity_policy_data_t policy;
2554
2555 bzero(&policy, sizeof(policy));
2556 policy.affinity_tag = tag;
2557 return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2558 (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2559 }
2560
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Eventhandler callback invoked when the set of active network-filter
 * subsystems changes.  The flowswitch transport netagent is enabled
 * only while no filters other than the PF private proxy are active;
 * a flip in that state is pushed to the nexus layer, and a disable
 * with no flip still refreshes all NECP clients.
 */
static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,
    enum net_filter_event_subsystems state)
{
	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
		if_enable_fsw_transport_netagent = 1;
	} else {
		if_enable_fsw_transport_netagent = 0;
	}
	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
		kern_nexus_update_netagents();
	} else if (!if_enable_fsw_transport_netagent) {
		necp_update_all_clients();
	}
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
2579
2580 void
dlil_init(void)2581 dlil_init(void)
2582 {
2583 thread_t thread = THREAD_NULL;
2584
2585 /*
2586 * The following fields must be 64-bit aligned for atomic operations.
2587 */
2588 IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2589 IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2590 IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2591 IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2592 IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2593 IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2594 IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2595 IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2596 IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2597 IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2598 IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2599 IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2600 IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2601 IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2602 IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2603
2604 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2605 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2606 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2607 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2608 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2609 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2610 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2611 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2612 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2613 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2614 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2615 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2616 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2617 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2618 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2619
2620 /*
2621 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
2622 */
2623 _CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
2624 _CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
2625 _CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
2626 _CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
2627 _CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
2628 _CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
2629 _CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
2630 _CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
2631 _CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
2632 _CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
2633 _CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
2634 _CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
2635 _CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
2636 _CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
2637
2638 /*
2639 * ... as well as the mbuf checksum flags counterparts.
2640 */
2641 _CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
2642 _CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
2643 _CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
2644 _CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
2645 _CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
2646 _CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
2647 _CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
2648 _CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
2649 _CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
2650 _CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
2651 _CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
2652
2653 /*
2654 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
2655 */
2656 _CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
2657 _CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
2658
2659 _CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
2660 _CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
2661 _CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
2662 _CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
2663
2664 _CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
2665 _CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
2666 _CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
2667
2668 _CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
2669 _CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
2670 _CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
2671 _CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
2672 _CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
2673 _CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
2674 _CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
2675 _CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
2676 _CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
2677 _CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
2678 _CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
2679 _CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
2680 _CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
2681 _CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
2682 _CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
2683 _CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
2684 _CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
2685 _CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);
2686
2687 _CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
2688 _CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
2689 _CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
2690 _CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
2691 _CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
2692 _CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
2693 _CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
2694 _CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
2695 _CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
2696 _CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
2697 _CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);
2698
2699 _CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
2700 _CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
2701
2702 PE_parse_boot_argn("net_affinity", &net_affinity,
2703 sizeof(net_affinity));
2704
2705 PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));
2706
2707 PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));
2708
2709 PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));
2710
2711 PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));
2712
2713 VERIFY(dlil_pending_thread_cnt == 0);
2714 #if SKYWALK
2715 boolean_t pe_enable_fsw_transport_netagent = FALSE;
2716 boolean_t pe_disable_fsw_transport_netagent = FALSE;
2717 boolean_t enable_fsw_netagent =
2718 (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
2719 (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
2720
2721 /*
2722 * Check the device tree to see if Skywalk netagent has been explicitly
2723 * enabled or disabled. This can be overridden via if_attach_nx below.
2724 * Note that the property is a 0-length key, and so checking for the
2725 * presence itself is enough (no need to check for the actual value of
2726 * the retrieved variable.)
2727 */
2728 pe_enable_fsw_transport_netagent =
2729 PE_get_default("kern.skywalk_netagent_enable",
2730 &pe_enable_fsw_transport_netagent,
2731 sizeof(pe_enable_fsw_transport_netagent));
2732 pe_disable_fsw_transport_netagent =
2733 PE_get_default("kern.skywalk_netagent_disable",
2734 &pe_disable_fsw_transport_netagent,
2735 sizeof(pe_disable_fsw_transport_netagent));
2736
2737 /*
2738 * These two are mutually exclusive, i.e. they both can be absent,
2739 * but only one can be present at a time, and so we assert to make
2740 * sure it is correct.
2741 */
2742 VERIFY((!pe_enable_fsw_transport_netagent &&
2743 !pe_disable_fsw_transport_netagent) ||
2744 (pe_enable_fsw_transport_netagent ^
2745 pe_disable_fsw_transport_netagent));
2746
2747 if (pe_enable_fsw_transport_netagent) {
2748 kprintf("SK: netagent is enabled via an override for "
2749 "this platform\n");
2750 if_attach_nx = SKYWALK_NETWORKING_ENABLED;
2751 } else if (pe_disable_fsw_transport_netagent) {
2752 kprintf("SK: netagent is disabled via an override for "
2753 "this platform\n");
2754 if_attach_nx = SKYWALK_NETWORKING_DISABLED;
2755 } else {
2756 kprintf("SK: netagent is %s by default for this platform\n",
2757 (enable_fsw_netagent ? "enabled" : "disabled"));
2758 if_attach_nx = IF_ATTACH_NX_DEFAULT;
2759 }
2760
2761 /*
2762 * Now see if there's a boot-arg override.
2763 */
2764 (void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
2765 sizeof(if_attach_nx));
2766 if_enable_fsw_transport_netagent =
2767 ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
2768
2769 if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);
2770
2771 if (pe_disable_fsw_transport_netagent &&
2772 if_enable_fsw_transport_netagent) {
2773 kprintf("SK: netagent is force-enabled\n");
2774 } else if (!pe_disable_fsw_transport_netagent &&
2775 !if_enable_fsw_transport_netagent) {
2776 kprintf("SK: netagent is force-disabled\n");
2777 }
2778 #ifdef XNU_TARGET_OS_OSX
2779 if (if_enable_fsw_transport_netagent) {
2780 net_filter_event_register(dlil_filter_event);
2781 }
2782 #endif /* XNU_TARGET_OS_OSX */
2783
2784 #if (DEVELOPMENT || DEBUG)
2785 (void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
2786 &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
2787 #endif /* (DEVELOPMENT || DEBUG) */
2788
2789 #endif /* SKYWALK */
2790 dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
2791 sizeof(struct dlil_ifnet_dbg);
2792 /* Enforce 64-bit alignment for dlil_ifnet structure */
2793 dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
2794 dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
2795 dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);
2796
2797 dlif_tcpstat_size = sizeof(struct tcpstat_local);
2798 /* Enforce 64-bit alignment for tcpstat_local structure */
2799 dlif_tcpstat_bufsize =
2800 dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
2801 dlif_tcpstat_bufsize = (uint32_t)
2802 P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
2803 dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
2804 dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);
2805
2806 dlif_udpstat_size = sizeof(struct udpstat_local);
2807 /* Enforce 64-bit alignment for udpstat_local structure */
2808 dlif_udpstat_bufsize =
2809 dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
2810 dlif_udpstat_bufsize = (uint32_t)
2811 P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
2812 dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
2813 dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);
2814
2815 eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);
2816
2817 TAILQ_INIT(&dlil_ifnet_head);
2818 TAILQ_INIT(&ifnet_head);
2819 TAILQ_INIT(&ifnet_detaching_head);
2820 TAILQ_INIT(&ifnet_ordered_head);
2821
2822 /* Initialize interface address subsystem */
2823 ifa_init();
2824
2825 #if PF
2826 /* Initialize the packet filter */
2827 pfinit();
2828 #endif /* PF */
2829
2830 /* Initialize queue algorithms */
2831 classq_init();
2832
2833 /* Initialize packet schedulers */
2834 pktsched_init();
2835
2836 /* Initialize flow advisory subsystem */
2837 flowadv_init();
2838
2839 /* Initialize the pktap virtual interface */
2840 pktap_init();
2841
2842 /* Initialize the service class to dscp map */
2843 net_qos_map_init();
2844
2845 /* Initialize the interface low power mode event handler */
2846 if_low_power_evhdlr_init();
2847
2848 /* Initialize the interface offload port list subsystem */
2849 if_ports_used_init();
2850
2851 #if DEBUG || DEVELOPMENT
2852 /* Run self-tests */
2853 dlil_verify_sum16();
2854 #endif /* DEBUG || DEVELOPMENT */
2855
2856 /*
2857 * Create and start up the main DLIL input thread and the interface
2858 * detacher threads once everything is initialized.
2859 */
2860 dlil_incr_pending_thread_count();
2861 (void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);
2862
2863 /*
2864 * Create ifnet detacher thread.
2865 * When an interface gets detached, part of the detach processing
2866 * is delayed. The interface is added to delayed detach list
2867 * and this thread is woken up to call ifnet_detach_final
2868 * on these interfaces.
2869 */
2870 dlil_incr_pending_thread_count();
2871 if (kernel_thread_start(ifnet_detacher_thread_func,
2872 NULL, &thread) != KERN_SUCCESS) {
2873 panic_plain("%s: couldn't create detacher thread", __func__);
2874 /* NOTREACHED */
2875 }
2876 thread_deallocate(thread);
2877
2878 /*
2879 * Wait for the created kernel threads for dlil to get
2880 * scheduled and run at least once before we proceed
2881 */
2882 lck_mtx_lock(&dlil_thread_sync_lock);
2883 while (dlil_pending_thread_cnt != 0) {
2884 DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
2885 "threads to get scheduled at least once.\n", __func__);
2886 (void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
2887 (PZERO - 1), __func__, NULL);
2888 LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
2889 }
2890 lck_mtx_unlock(&dlil_thread_sync_lock);
2891 DLIL_PRINTF("%s: All the created dlil kernel threads have been "
2892 "scheduled at least once. Proceeding.\n", __func__);
2893 }
2894
2895 static void
if_flt_monitor_busy(struct ifnet * ifp)2896 if_flt_monitor_busy(struct ifnet *ifp)
2897 {
2898 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2899
2900 ++ifp->if_flt_busy;
2901 VERIFY(ifp->if_flt_busy != 0);
2902 }
2903
/*
 * Drop one busy reference on the interface filter monitor; thin
 * wrapper around if_flt_monitor_leave().  Caller must hold
 * ifp->if_flt_lock (asserted by the callee).
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2909
/*
 * Wait until the interface filter monitor is free, then claim it by
 * taking a busy reference on behalf of the caller.
 *
 * Caller must hold ifp->if_flt_lock; msleep() drops and reacquires
 * the lock while sleeping.  Each pass through the loop re-registers
 * the caller as a waiter, since if_flt_monitor_leave() resets
 * if_flt_waiters to zero when it issues the wakeup.
 */
static void
if_flt_monitor_enter(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	while (ifp->if_flt_busy) {
		++ifp->if_flt_waiters;
		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
		    (PZERO - 1), "if_flt_monitor", NULL);
	}
	if_flt_monitor_busy(ifp);
}
2922
/*
 * Release one busy reference on the interface filter monitor, and
 * wake up any threads blocked in if_flt_monitor_enter() once the
 * count drops to zero.  Caller must hold ifp->if_flt_lock.
 */
static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_flt_busy != 0);
	--ifp->if_flt_busy;

	/*
	 * Waiters are counted in if_flt_monitor_enter(); reset the count
	 * here since wakeup() rouses all of them at once.
	 */
	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
2936
/*
 * Attach an interface filter described by if_filter to ifp, returning
 * the new filter's handle via *filter_ref.
 *
 * Returns 0 on success; ENXIO if the interface is not in the global
 * ifnet list or is no longer attached.  On success the filter is
 * linked at the tail of ifp->if_flt_head under the filter monitor,
 * TSO state and the net_api_stats counters are updated.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* takes an I/O refcnt on success; released before "done" below */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	/* detached callback is installed unconditionally */
	filter->filt_detached = if_filter->iff_detached;

	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* non-internal filters are tracked per-interface as well */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the I/O refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	/* filter is only non-NULL here if allocation happened; error paths
	 * that reach this point before insertion free it again */
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
3027
/*
 * Detach an interface filter.
 *
 * When detached == 0 (explicit detach via dlil_detach_filter()), the
 * filter is located by scanning every attached interface's filter
 * list; on a match it is marked filt_skip (so input/output callbacks
 * stop firing), removed from the list under the filter monitor, and
 * destroyed.  EINVAL is returned if the filter reference is not found
 * on any interface.
 *
 * When detached != 0 (implicit detach from ifnet_detach_final()), the
 * caller has already emptied if_flt_head, so only the per-interface
 * counters and TSO state are adjusted before the filter is destroyed.
 *
 * In both cases the filter's filt_detached callback (if any) is
 * invoked and the filter memory is freed.
 */
static int
dlil_detach_filter_internal(interface_filter_t filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1; /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				/* reacquire and wait for the monitor to be free */
				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	/* NULLed so the error print below is skipped on the success path */
	filter = NULL;
done:
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
3148
3149 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)3150 dlil_detach_filter(interface_filter_t filter)
3151 {
3152 if (filter == NULL) {
3153 return;
3154 }
3155 dlil_detach_filter_internal(filter, 0);
3156 }
3157
3158 __private_extern__ boolean_t
dlil_has_ip_filter(void)3159 dlil_has_ip_filter(void)
3160 {
3161 boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
3162
3163 VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
3164
3165 DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
3166 return has_filter;
3167 }
3168
3169 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)3170 dlil_has_if_filter(struct ifnet *ifp)
3171 {
3172 boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
3173 DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
3174 return has_filter;
3175 }
3176
3177 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)3178 dlil_input_wakeup(struct dlil_threading_info *inp)
3179 {
3180 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3181
3182 inp->dlth_flags |= DLIL_INPUT_WAITING;
3183 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
3184 inp->dlth_wtot++;
3185 wakeup_one((caddr_t)&inp->dlth_flags);
3186 }
3187 }
3188
/*
 * Entry point for the main DLIL input thread.  Performs one-time
 * sanity checks, marks the thread embryonic, issues a self-wakeup so
 * that the continuation runs at least once (letting dlil_init() see
 * the thread has been scheduled), then blocks into
 * dlil_main_input_thread_cont(), which never returns here.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* register for the wakeup channel before advertising the state */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3211
3212 /*
3213 * Main input thread:
3214 *
3215 * a) handles all inbound packets for lo0
3216 * b) handles all inbound packets for interfaces with no dedicated
3217 * input thread (e.g. anything but Ethernet/PDP or those that support
3218 * opportunistic polling.)
3219 * c) protocol registrations
3220 * d) packet injections
3221 */
/*
 * Continuation body for the main DLIL input thread.  Loops draining
 * the shared receive queue (non-dedicated interfaces), the lo0-only
 * queue, and any pending protocol registration/injection work; once
 * everything is quiesced it re-arms the wait and blocks back into
 * itself via thread_block_parameter().  This thread is never
 * interrupted or terminated.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* inpm and inp alias the same object; inpm exposes the lo0 queue */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass only: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* drop the lock while processing the drained packets */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* tell dlil_init() this thread has run once */
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* exit the loop only when no other flag bits are pending */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	/* re-arm the wait before dropping the lock, then block again */
	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0); /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3308
3309 /*
3310 * Input thread for interfaces with legacy input model.
3311 */
/*
 * Entry point for a per-interface legacy-model input thread.  Names
 * the thread after the interface, marks it embryonic, self-wakes so
 * the continuation runs at least once, then blocks into
 * dlil_input_thread_cont(), which never returns here.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* legacy interfaces must not be in rxpoll mode */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* register for the wakeup channel before advertising the state */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3346
/*
 * Continuation body for a per-interface legacy input thread.  Drains
 * the interface's receive queue in a loop, syncs input statistics,
 * and re-blocks when idle.  Unlike the main input thread this one
 * can be interrupted or asked to terminate (DLIL_INPUT_TERMINATE),
 * in which case it exits through dlil_terminate_input_thread().
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass only: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while processing the drained packets */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* tell the attach path this thread has run once */
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* exit the loop when only RUNNING and/or TERMINATE remain */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait before dropping the lock, then block again */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0); /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3450
3451 /*
3452 * Input thread for interfaces with opportunistic polling input model.
3453 */
/*
 * Entry point for a per-interface opportunistic-polling (rxpoll)
 * input thread.  Names the thread after the interface, marks it
 * embryonic, self-wakes so the continuation runs at least once, then
 * blocks into dlil_rxpoll_input_thread_cont(), which never returns
 * here.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	/* only legacy interfaces that advertise RXPOLL use this thread */
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* register for the wakeup channel before advertising the state */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3485
3486 __attribute__((noreturn))
3487 static void
dlil_rxpoll_input_thread_cont(void * v,wait_result_t wres)3488 dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
3489 {
3490 struct dlil_threading_info *inp = v;
3491 struct ifnet *ifp = inp->dlth_ifp;
3492 struct timespec ts;
3493
3494 lck_mtx_lock_spin(&inp->dlth_lock);
3495 if (__improbable(wres == THREAD_INTERRUPTED ||
3496 (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
3497 goto terminate;
3498 }
3499
3500 VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
3501 inp->dlth_flags |= DLIL_INPUT_RUNNING;
3502
3503 while (1) {
3504 struct mbuf *m = NULL;
3505 uint32_t m_cnt, poll_req = 0;
3506 uint64_t m_size = 0;
3507 ifnet_model_t mode;
3508 struct timespec now, delta;
3509 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3510 boolean_t notify;
3511 boolean_t embryonic;
3512 uint64_t ival;
3513
3514 inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3515
3516 if (__improbable(embryonic =
3517 (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3518 inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3519 goto skip;
3520 }
3521
3522 if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
3523 ival = IF_RXPOLL_INTERVALTIME_MIN;
3524 }
3525
3526 /* Link parameters changed? */
3527 if (ifp->if_poll_update != 0) {
3528 ifp->if_poll_update = 0;
3529 (void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
3530 }
3531
3532 /* Current operating mode */
3533 mode = ifp->if_poll_mode;
3534
3535 /*
3536 * Protocol registration and injection must always use
3537 * the main input thread; in theory the latter can utilize
3538 * the corresponding input thread where the packet arrived
3539 * on, but that requires our knowing the interface in advance
3540 * (and the benefits might not worth the trouble.)
3541 */
3542 VERIFY(!(inp->dlth_flags &
3543 (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
3544
3545 /* Total count of all packets */
3546 m_cnt = qlen(&inp->dlth_pkts);
3547
3548 /* Total bytes of all packets */
3549 m_size = qsize(&inp->dlth_pkts);
3550
3551 /* Packets for this interface */
3552 _getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3553 m = pkt.cp_mbuf;
3554 VERIFY(m != NULL || m_cnt == 0);
3555
3556 nanouptime(&now);
3557 if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
3558 *(&ifp->if_poll_sample_lasttime) = *(&now);
3559 }
3560
3561 net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
3562 if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
3563 u_int32_t ptot, btot;
3564
3565 /* Accumulate statistics for current sampling */
3566 PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);
3567
3568 if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
3569 goto skip;
3570 }
3571
3572 *(&ifp->if_poll_sample_lasttime) = *(&now);
3573
3574 /* Calculate min/max of inbound bytes */
3575 btot = (u_int32_t)ifp->if_poll_sstats.bytes;
3576 if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
3577 ifp->if_rxpoll_bmin = btot;
3578 }
3579 if (btot > ifp->if_rxpoll_bmax) {
3580 ifp->if_rxpoll_bmax = btot;
3581 }
3582
3583 /* Calculate EWMA of inbound bytes */
3584 DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);
3585
3586 /* Calculate min/max of inbound packets */
3587 ptot = (u_int32_t)ifp->if_poll_sstats.packets;
3588 if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
3589 ifp->if_rxpoll_pmin = ptot;
3590 }
3591 if (ptot > ifp->if_rxpoll_pmax) {
3592 ifp->if_rxpoll_pmax = ptot;
3593 }
3594
3595 /* Calculate EWMA of inbound packets */
3596 DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
3597
3598 /* Reset sampling statistics */
3599 PKTCNTR_CLEAR(&ifp->if_poll_sstats);
3600
3601 /* Calculate EWMA of wakeup requests */
3602 DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
3603 if_rxpoll_decay);
3604 inp->dlth_wtot = 0;
3605
3606 if (dlil_verbose) {
3607 if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
3608 *(&ifp->if_poll_dbg_lasttime) = *(&now);
3609 }
3610 net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
3611 if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
3612 *(&ifp->if_poll_dbg_lasttime) = *(&now);
3613 DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
3614 "limits [%d/%d], wreq avg %d "
3615 "limits [%d/%d], bytes avg %d "
3616 "limits [%d/%d]\n", if_name(ifp),
3617 (ifp->if_poll_mode ==
3618 IFNET_MODEL_INPUT_POLL_ON) ?
3619 "ON" : "OFF", ifp->if_rxpoll_pavg,
3620 ifp->if_rxpoll_pmax,
3621 ifp->if_rxpoll_plowat,
3622 ifp->if_rxpoll_phiwat,
3623 ifp->if_rxpoll_wavg,
3624 ifp->if_rxpoll_wlowat,
3625 ifp->if_rxpoll_whiwat,
3626 ifp->if_rxpoll_bavg,
3627 ifp->if_rxpoll_blowat,
3628 ifp->if_rxpoll_bhiwat);
3629 }
3630 }
3631
3632 /* Perform mode transition, if necessary */
3633 if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
3634 *(&ifp->if_poll_mode_lasttime) = *(&now);
3635 }
3636
3637 net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
3638 if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
3639 goto skip;
3640 }
3641
3642 if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
3643 ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
3644 ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
3645 mode = IFNET_MODEL_INPUT_POLL_OFF;
3646 } else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
3647 (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
3648 ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
3649 ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
3650 mode = IFNET_MODEL_INPUT_POLL_ON;
3651 }
3652
3653 if (mode != ifp->if_poll_mode) {
3654 ifp->if_poll_mode = mode;
3655 *(&ifp->if_poll_mode_lasttime) = *(&now);
3656 poll_req++;
3657 }
3658 }
3659 skip:
3660 notify = dlil_input_stats_sync(ifp, inp);
3661
3662 lck_mtx_unlock(&inp->dlth_lock);
3663
3664 if (__improbable(embryonic)) {
3665 ifnet_decr_pending_thread_count(ifp);
3666 }
3667
3668 if (__improbable(notify)) {
3669 ifnet_notify_data_threshold(ifp);
3670 }
3671
3672 /*
3673 * If there's a mode change and interface is still attached,
3674 * perform a downcall to the driver for the new mode. Also
3675 * hold an IO refcnt on the interface to prevent it from
3676 * being detached (will be release below.)
3677 */
3678 if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
3679 struct ifnet_model_params p = {
3680 .model = mode, .reserved = { 0 }
3681 };
3682 errno_t err;
3683
3684 if (dlil_verbose) {
3685 DLIL_PRINTF("%s: polling is now %s, "
3686 "pkts avg %d max %d limits [%d/%d], "
3687 "wreq avg %d limits [%d/%d], "
3688 "bytes avg %d limits [%d/%d]\n",
3689 if_name(ifp),
3690 (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3691 "ON" : "OFF", ifp->if_rxpoll_pavg,
3692 ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
3693 ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
3694 ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
3695 ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
3696 ifp->if_rxpoll_bhiwat);
3697 }
3698
3699 if ((err = ((*ifp->if_input_ctl)(ifp,
3700 IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
3701 DLIL_PRINTF("%s: error setting polling mode "
3702 "to %s (%d)\n", if_name(ifp),
3703 (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3704 "ON" : "OFF", err);
3705 }
3706
3707 switch (mode) {
3708 case IFNET_MODEL_INPUT_POLL_OFF:
3709 ifnet_set_poll_cycle(ifp, NULL);
3710 ifp->if_rxpoll_offreq++;
3711 if (err != 0) {
3712 ifp->if_rxpoll_offerr++;
3713 }
3714 break;
3715
3716 case IFNET_MODEL_INPUT_POLL_ON:
3717 net_nsectimer(&ival, &ts);
3718 ifnet_set_poll_cycle(ifp, &ts);
3719 ifnet_poll(ifp);
3720 ifp->if_rxpoll_onreq++;
3721 if (err != 0) {
3722 ifp->if_rxpoll_onerr++;
3723 }
3724 break;
3725
3726 default:
3727 VERIFY(0);
3728 /* NOTREACHED */
3729 }
3730
3731 /* Release the IO refcnt */
3732 ifnet_decr_iorefcnt(ifp);
3733 }
3734
3735 /*
3736 * NOTE warning %%% attention !!!!
3737 * We should think about putting some thread starvation
3738 * safeguards if we deal with long chains of packets.
3739 */
3740 if (__probable(m != NULL)) {
3741 dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
3742 }
3743
3744 lck_mtx_lock_spin(&inp->dlth_lock);
3745 VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3746 if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
3747 DLIL_INPUT_TERMINATE))) {
3748 break;
3749 }
3750 }
3751
3752 inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3753
3754 if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
3755 terminate:
3756 lck_mtx_unlock(&inp->dlth_lock);
3757 dlil_terminate_input_thread(inp);
3758 /* NOTREACHED */
3759 } else {
3760 (void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3761 lck_mtx_unlock(&inp->dlth_lock);
3762 (void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
3763 inp);
3764 /* NOTREACHED */
3765 }
3766
3767 VERIFY(0); /* we should never get here */
3768 /* NOTREACHED */
3769 __builtin_unreachable();
3770 }
3771
3772 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3773 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3774 {
3775 if (p != NULL) {
3776 if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3777 (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3778 return EINVAL;
3779 }
3780 if (p->packets_lowat != 0 && /* hiwat must be non-zero */
3781 p->packets_lowat >= p->packets_hiwat) {
3782 return EINVAL;
3783 }
3784 if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3785 (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3786 return EINVAL;
3787 }
3788 if (p->bytes_lowat != 0 && /* hiwat must be non-zero */
3789 p->bytes_lowat >= p->bytes_hiwat) {
3790 return EINVAL;
3791 }
3792 if (p->interval_time != 0 &&
3793 p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3794 p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3795 }
3796 }
3797 return 0;
3798 }
3799
/*
 * Recompute the receive-polling parameters for `ifp'.
 *
 * If the input link rate is unknown (zero) and no explicit parameters
 * were supplied, polling is effectively disabled: low watermarks are
 * zeroed and high watermarks set to UINT32_MAX so the mode machine
 * never switches to polling.  Otherwise, baseline values are selected
 * from rxpoll_tbl by link rate, with any non-zero field in `p'
 * (driver-supplied) overriding the auto-tuned value.
 *
 * Caller must hold the input thread's dlth_lock (see
 * dlil_rxpoll_set_params.)
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;	/* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		/* unreachable high watermarks: never enter polling mode */
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/*
		 * Find the highest-rate table entry whose speed does
		 * not exceed the current input link rate.
		 */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/*
		 * NOTE(review): a non-zero if_rxpoll_max sysctl overrides
		 * the caller's packets_limit, and a non-default
		 * if_rxpoll_interval_time overrides the caller's
		 * interval_time — the global wins, unlike the watermarks
		 * above.  Presumably intentional (sysctl as an operator
		 * override); confirm before restructuring.
		 */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ? if_rxpoll_max : p->packets_limit);
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* Convert nanosecond holdtimes into timespec form for the mode machine. */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3869
3870 /*
3871 * Must be called on an attached ifnet (caller is expected to check.)
3872 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3873 */
3874 errno_t
dlil_rxpoll_set_params(struct ifnet * ifp,struct ifnet_poll_params * p,boolean_t locked)3875 dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
3876 boolean_t locked)
3877 {
3878 errno_t err;
3879 struct dlil_threading_info *inp;
3880
3881 VERIFY(ifp != NULL);
3882 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3883 return ENXIO;
3884 }
3885 err = dlil_rxpoll_validate_params(p);
3886 if (err != 0) {
3887 return err;
3888 }
3889
3890 if (!locked) {
3891 lck_mtx_lock(&inp->dlth_lock);
3892 }
3893 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3894 /*
3895 * Normally, we'd reset the parameters to the auto-tuned values
3896 * if the the input thread detects a change in link rate. If the
3897 * driver provides its own parameters right after a link rate
3898 * changes, but before the input thread gets to run, we want to
3899 * make sure to keep the driver's values. Clearing if_poll_update
3900 * will achieve that.
3901 */
3902 if (p != NULL && !locked && ifp->if_poll_update != 0) {
3903 ifp->if_poll_update = 0;
3904 }
3905 dlil_rxpoll_update_params(ifp, p);
3906 if (!locked) {
3907 lck_mtx_unlock(&inp->dlth_lock);
3908 }
3909 return 0;
3910 }
3911
3912 /*
3913 * Must be called on an attached ifnet (caller is expected to check.)
3914 */
3915 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3916 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3917 {
3918 struct dlil_threading_info *inp;
3919
3920 VERIFY(ifp != NULL && p != NULL);
3921 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3922 return ENXIO;
3923 }
3924
3925 bzero(p, sizeof(*p));
3926
3927 lck_mtx_lock(&inp->dlth_lock);
3928 p->packets_limit = ifp->if_rxpoll_plim;
3929 p->packets_lowat = ifp->if_rxpoll_plowat;
3930 p->packets_hiwat = ifp->if_rxpoll_phiwat;
3931 p->bytes_lowat = ifp->if_rxpoll_blowat;
3932 p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3933 p->interval_time = ifp->if_rxpoll_ival;
3934 lck_mtx_unlock(&inp->dlth_lock);
3935
3936 return 0;
3937 }
3938
/*
 * Enqueue a chain of inbound packets from `ifp'.  Legacy variant:
 * the chain tail and the packet/byte counts are derived by walking
 * the chain in ifnet_input_common (ext == FALSE); `s' is optional.
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3945
/*
 * Extended variant: the caller supplies the chain tail and accurate
 * packet/byte statistics in `s' (both are mandatory; enforced by the
 * VERIFYs and the packet-count panic in ifnet_input_common.)
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3952
/*
 * Variant used by drivers operating in polling mode; `m_head' may be
 * NULL to indicate an empty poll.  The chain is treated as extended
 * (caller-supplied tail/stats) only when packets are actually present.
 */
errno_t
ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s,
	    (m_head != NULL), TRUE);
}
3960
/*
 * Common input path behind ifnet_input{,_extended,_poll}().
 *
 * Validates the chain and the caller-provided statistics, takes a
 * datamov (IO) reference on the interface (lo0 is exempt), then hands
 * the chain to the interface's DLIL input function (if_input_dlil).
 *
 * `ext'  - caller supplied the chain tail and accurate stats in `s'.
 * `poll' - the chain originates from a driver in polling mode
 *          (m_head may then be NULL for an empty poll.)
 *
 * Frees the chain and returns EINVAL on parameter errors or if the
 * interface is detaching.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	/* A NULL chain is only legal for an empty poll with no stats/tail. */
	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* No tail given: walk the chain to find it and count as we go. */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* Recount the chain to cross-check the driver's stats. */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	/*
	 * NOTE(review): `s' (not `&_s') is passed below, so the recomputed
	 * counts in _s reach input_func only when the caller passed
	 * s == NULL (s then aliases _s).  With a caller-provided s the
	 * original stats are passed through unchanged — confirm intended
	 * before "fixing".
	 */
	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
4075
4076 #if SKYWALK
/*
 * Atomically install `fn' as the interface's DLIL input handler.
 * Succeeds only if the current handler is still the default
 * dlil_input_handler; returns EBUSY if someone else already
 * hooked it.  (ptrauth_nop_cast strips/ignores pointer signing
 * for the raw compare-and-swap.)
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, &dlil_input_handler),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4084
/*
 * Restore the default DLIL input handler.  Retries until the CAS
 * succeeds; each iteration re-reads if_input_dlil as the expected
 * value, so the swap eventually wins against a concurrent update.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
		;
	}
}
4094
/*
 * Atomically install `fn' as the interface's DLIL output handler.
 * Succeeds only if the current handler is still the default
 * dlil_output_handler; returns EBUSY otherwise.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, &dlil_output_handler),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4102
/*
 * Restore the default DLIL output handler; spins on the CAS with a
 * freshly re-read expected value until it succeeds (mirrors
 * dlil_reset_input_handler.)
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
		;
	}
}
4112 #endif /* SKYWALK */
4113
/*
 * Default DLIL output handler: hand the chain straight to the
 * driver's if_output routine.
 */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
4119
/*
 * Default DLIL input handler: dispatch the chain to the interface's
 * input thread (falling back to the main input thread when none is
 * assigned) via the thread's configured strategy.  On DEVELOPMENT or
 * DEBUG kernels, a calling thread marked NET_THREAD_SYNC_RX forces
 * the synchronous path regardless of the configured strategy.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
4140
4141 /*
4142 * Detect whether a queue contains a burst that needs to be trimmed.
4143 */
/*
 * True when `q' is an mbuf queue whose length exceeds both its own
 * qlimit and the global if_rcvq_burst_limit sysctl (whichever is
 * larger) — i.e. a burst large enough to warrant trimming.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q) \
    __improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) && \
    qtype(q) == QP_MBUF)

/* Number of known mbuf classes; appears unused in this chunk — TODO confirm. */
#define MAX_KNOWN_MBUF_CLASS 8
4149
/*
 * Trim an overcommitted input queue down to if_rcvq_trim_pct percent
 * of its configured limit.  The dropped (oldest) packets are moved
 * onto `freeq' so the caller can free them after releasing dlth_lock,
 * and `stat_delta' is adjusted so interface statistics account for
 * the drops.  Returns the number of packets dropped.
 *
 * Caller must hold the input thread's dlth_lock.
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;    /* Length in packets. */
	uint64_t overcommitted_qsize;   /* Size in bytes. */
	uint32_t target_qlen;           /* The desired queue length after trimming. */
	uint32_t pkts_to_drop = 0;      /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;      /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 * Saturating subtraction: the delta may describe fewer
	 * packets/bytes than were just dropped.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
4246
/*
 * Asynchronous input strategy: enqueue the chain on the input
 * thread's receive queue (or the dedicated lo0 queue when running on
 * the main input thread), trim the queue if the burst limit is
 * exceeded, fold in the statistics, and wake the input thread to do
 * the actual protocol processing later.  Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	/* Local copy of the stats; adjusted if packets are trimmed below. */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set. We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack. Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		/* Trim the queue if this burst pushed it over the limit. */
		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context. All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4390
/*
 * Synchronous input strategy: enqueue the chain on the input thread's
 * queue, trim if overcommitted, then immediately dequeue everything
 * and process it in the calling thread's context instead of waking
 * the input thread.  Never used with the main input thread.  Always
 * returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	/* Local copy of the stats; adjusted if packets are trimmed below. */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/* Trim the queue if this burst pushed it over the limit. */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/* Cross-check the caller's stats against the actual chain. */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* Drain everything queued (may include packets from earlier calls.) */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4500
4501 #if SKYWALK
/*
 * Atomically install `fn' as the interface's if_output routine.
 * Succeeds only if if_output still holds the saved original
 * (if_save_output); returns EBUSY otherwise.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_save_output),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4509
/*
 * Restore if_output to the saved original (if_save_output); spins on
 * the CAS with a freshly re-read expected value until it succeeds.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
		;
	}
}
4519
/*
 * Atomically install `fn' as the interface's if_start routine.
 * Succeeds only if if_start still holds the saved original
 * (if_save_start); returns EBUSY otherwise.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_save_start),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4527
/*
 * Restore if_start to the saved original (if_save_start); spins on
 * the CAS with a freshly re-read expected value until it succeeds.
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
		;
	}
}
4537 #endif /* SKYWALK */
4538
/*
 * Common transmit-kick path behind ifnet_start{,_ignore_delay}().
 * `resetfc' clears any flow-control condition before deciding to
 * wake the starter thread; `ignore_delay' disables the start-delay
 * (batching) heuristic for this interface.  No-op unless the
 * interface uses the new TX model (IFEF_TXSTART).
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (ignore_delay) {
		ifp->if_start_flags |= IFSF_NO_DELAY;
	}
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/*
	 * Wake the starter thread unless delayed-start batching is in
	 * effect (IFEF_ENQUEUE_MULTI with a short send queue) — in that
	 * case the pending request is picked up on the next wakeup.
	 */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4571
/*
 * Record the pacemaker transmit time for the starter thread.
 * NOTE(review): plain store without if_start_lock — presumably a
 * racy-but-benign hint; confirm readers tolerate this on all
 * supported platforms.
 */
void
ifnet_start_set_pacemaker_time(struct ifnet *ifp, uint64_t tx_time)
{
	ifp->if_start_pacemaker_time = tx_time;
}
4577
/*
 * Kick the interface's starter thread (honors flow control and the
 * delayed-start heuristic; see ifnet_start_common.)
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, FALSE);
}
4583
/*
 * Like ifnet_start(), but additionally sets IFSF_NO_DELAY to bypass
 * the delayed-start (batching) heuristic.
 */
void
ifnet_start_ignore_delay(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, TRUE);
}
4589
/*
 * Entry point for an interface's dedicated starter thread.  Performs
 * one-time setup (thread name, optional lo0 affinity binding), then
 * parks in a continuation-style block on ifnet_start_thread_cont,
 * which does the actual TX servicing.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag. This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4655
/*
 * Continuation for the transmit starter thread; resumed on every wakeup
 * of &ifp->if_start_thread.  Services pending start requests by calling
 * the driver's if_start routine, computes the next wakeup deadline
 * (pacemaker / TBR / delayed-start), then blocks again with this same
 * function as the continuation.  Never returns; the thread either
 * re-blocks or terminates itself.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	/* interrupted wait or interface teardown: shut this thread down */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	if (__improbable(ifp->if_start_embryonic)) {
		/* first wakeup: leave embryonic state and report readiness */
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		/* snapshot the request count to detect new arrivals below */
		u_int32_t req = ifp->if_start_req;
		/*
		 * Delayed-start heuristic: with IFEF_ENQUEUE_MULTI and a
		 * still-shallow queue, hold off calling the driver so more
		 * packets can coalesce before the next dequeue.
		 */
		if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
		    !IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_flags &= ~IFSF_NO_DELAY;
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request (if_start_req unchanged
		 * since the snapshot above) or if the interface has been
		 * disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec pacemaker_ts;
		struct timespec *ts = NULL;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		if (ifp->if_start_pacemaker_time != 0) {
			struct timespec now_ts;
			uint64_t now;

			nanouptime(&now_ts);
			now = ((uint64_t)now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;

			if (ifp->if_start_pacemaker_time != 0 &&
			    ifp->if_start_pacemaker_time > now) {
				/*
				 * Pacemaker deadline still in the future:
				 * arm a relative wakeup for the remainder.
				 * NOTE(review): tv_nsec may exceed one second
				 * here; the deadline computation below treats
				 * ts purely as a nanosecond interval.
				 */
				pacemaker_ts.tv_sec = 0;
				pacemaker_ts.tv_nsec = ifp->if_start_pacemaker_time - now;

				ts = &pacemaker_ts;
				ifp->if_start_flags |= IFSF_NO_DELAY;
				DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*, ifp,
				    uint64_t, pacemaker_ts.tv_nsec);
			} else {
				/* deadline already passed: clear it */
				DTRACE_SKYWALK2(pacemaker__timer__miss, struct ifnet*, ifp,
				    uint64_t, now - ifp->if_start_pacemaker_time);
				ifp->if_start_pacemaker_time = 0;
				ifp->if_start_flags &= ~IFSF_NO_DELAY;
			}
		}

		/* TBR-driven periodic restart, if no pacemaker wakeup armed */
		if (ts == NULL) {
			ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
			    &ifp->if_start_cycle : NULL);
		}

		/* delayed-start timeout, if a delayed start is pending */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		/* a zero interval means "sleep forever" */
		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		ifp->if_start_pacemaker_time = 0;
		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4833
4834 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4835 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4836 {
4837 if (ts == NULL) {
4838 bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4839 } else {
4840 *(&ifp->if_start_cycle) = *ts;
4841 }
4842
4843 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4844 DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4845 if_name(ifp), ts->tv_nsec);
4846 }
4847 }
4848
4849 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4850 ifnet_poll_wakeup(struct ifnet *ifp)
4851 {
4852 LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4853
4854 ifp->if_poll_req++;
4855 if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4856 ifp->if_poll_thread != THREAD_NULL) {
4857 wakeup_one((caddr_t)&ifp->if_poll_thread);
4858 }
4859 }
4860
/*
 * Request a poll cycle: wake the interface's RX poller thread if it
 * is currently idle.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4871
/*
 * Entry point for an interface's dedicated RX poller thread.  Performs
 * one-time setup (thread name), then parks the thread with
 * ifnet_poll_thread_cont() as its continuation; all polling happens in
 * that continuation.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	/* block; resumes in ifnet_poll_thread_cont(), never here */
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4900
/*
 * Continuation for the RX poller thread; resumed on every wakeup of
 * &ifp->if_poll_thread.  Repeatedly drives the driver's input-poll
 * routine and feeds harvested packets into the input path, then blocks
 * again with this same function as the continuation.  Never returns;
 * the thread either re-blocks or terminates itself.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	/* interrupted wait or interface teardown: shut this thread down */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		/* first wakeup: leave embryonic state and report readiness */
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		/* snapshot the request count to detect new arrivals below */
		u_int16_t req = ifp->if_poll_req;

		/* per-poll packet limit: explicit limit, or derived default */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		    &m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll still notifies the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
5067
5068 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)5069 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
5070 {
5071 if (ts == NULL) {
5072 bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
5073 } else {
5074 *(&ifp->if_poll_cycle) = *ts;
5075 }
5076
5077 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
5078 DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
5079 if_name(ifp), ts->tv_nsec);
5080 }
5081 }
5082
5083 void
ifnet_purge(struct ifnet * ifp)5084 ifnet_purge(struct ifnet *ifp)
5085 {
5086 if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
5087 if_qflush_snd(ifp, false);
5088 }
5089 }
5090
5091 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)5092 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
5093 {
5094 IFCQ_LOCK_ASSERT_HELD(ifq);
5095
5096 if (!(IFCQ_IS_READY(ifq))) {
5097 return;
5098 }
5099
5100 if (IFCQ_TBR_IS_ENABLED(ifq)) {
5101 struct tb_profile tb = {
5102 .rate = ifq->ifcq_tbr.tbr_rate_raw,
5103 .percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
5104 };
5105 (void) ifclassq_tbr_set(ifq, &tb, FALSE);
5106 }
5107
5108 ifclassq_update(ifq, ev);
5109 }
5110
5111 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)5112 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
5113 {
5114 switch (ev) {
5115 case CLASSQ_EV_LINK_BANDWIDTH:
5116 if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
5117 ifp->if_poll_update++;
5118 }
5119 break;
5120
5121 default:
5122 break;
5123 }
5124 }
5125
5126 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)5127 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
5128 {
5129 struct ifclassq *ifq;
5130 u_int32_t omodel;
5131 errno_t err;
5132
5133 if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
5134 return EINVAL;
5135 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5136 return ENXIO;
5137 }
5138
5139 ifq = ifp->if_snd;
5140 IFCQ_LOCK(ifq);
5141 omodel = ifp->if_output_sched_model;
5142 ifp->if_output_sched_model = model;
5143 if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
5144 ifp->if_output_sched_model = omodel;
5145 }
5146 IFCQ_UNLOCK(ifq);
5147
5148 return err;
5149 }
5150
5151 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5152 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5153 {
5154 if (ifp == NULL) {
5155 return EINVAL;
5156 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5157 return ENXIO;
5158 }
5159
5160 ifclassq_set_maxlen(ifp->if_snd, maxqlen);
5161
5162 return 0;
5163 }
5164
5165 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5166 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5167 {
5168 if (ifp == NULL || maxqlen == NULL) {
5169 return EINVAL;
5170 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5171 return ENXIO;
5172 }
5173
5174 *maxqlen = ifclassq_get_maxlen(ifp->if_snd);
5175
5176 return 0;
5177 }
5178
5179 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)5180 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
5181 {
5182 errno_t err;
5183
5184 if (ifp == NULL || pkts == NULL) {
5185 err = EINVAL;
5186 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5187 err = ENXIO;
5188 } else {
5189 err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
5190 IF_CLASSQ_ALL_GRPS, pkts, NULL);
5191 }
5192
5193 return err;
5194 }
5195
5196 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)5197 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
5198 u_int32_t *pkts, u_int32_t *bytes)
5199 {
5200 errno_t err;
5201
5202 if (ifp == NULL || !MBUF_VALID_SC(sc) ||
5203 (pkts == NULL && bytes == NULL)) {
5204 err = EINVAL;
5205 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5206 err = ENXIO;
5207 } else {
5208 err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
5209 pkts, bytes);
5210 }
5211
5212 return err;
5213 }
5214
5215 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5216 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5217 {
5218 struct dlil_threading_info *inp;
5219
5220 if (ifp == NULL) {
5221 return EINVAL;
5222 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5223 return ENXIO;
5224 }
5225
5226 if (maxqlen == 0) {
5227 maxqlen = if_rcvq_maxlen;
5228 } else if (maxqlen < IF_RCVQ_MINLEN) {
5229 maxqlen = IF_RCVQ_MINLEN;
5230 }
5231
5232 inp = ifp->if_inp;
5233 lck_mtx_lock(&inp->dlth_lock);
5234 qlimit(&inp->dlth_pkts) = maxqlen;
5235 lck_mtx_unlock(&inp->dlth_lock);
5236
5237 return 0;
5238 }
5239
5240 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5241 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5242 {
5243 struct dlil_threading_info *inp;
5244
5245 if (ifp == NULL || maxqlen == NULL) {
5246 return EINVAL;
5247 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5248 return ENXIO;
5249 }
5250
5251 inp = ifp->if_inp;
5252 lck_mtx_lock(&inp->dlth_lock);
5253 *maxqlen = qlimit(&inp->dlth_pkts);
5254 lck_mtx_unlock(&inp->dlth_lock);
5255 return 0;
5256 }
5257
5258 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)5259 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
5260 uint16_t delay_timeout)
5261 {
5262 if (delay_qlen > 0 && delay_timeout > 0) {
5263 if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
5264 ifp->if_start_delay_qlen = MIN(100, delay_qlen);
5265 ifp->if_start_delay_timeout = min(20000, delay_timeout);
5266 /* convert timeout to nanoseconds */
5267 ifp->if_start_delay_timeout *= 1000;
5268 kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
5269 ifp->if_xname, (uint32_t)delay_qlen,
5270 (uint32_t)delay_timeout);
5271 } else {
5272 if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
5273 }
5274 }
5275
/*
 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
 * buf holds the full header.
 *
 * For IPv4 the header checksum is adjusted incrementally (add the old
 * TOS word, subtract the new, fold the carry) rather than recomputed.
 * Misaligned headers are bounced through a local aligned buffer and
 * copied back afterwards.
 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned bounce buffer, large enough for either header */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* nothing to do if the DSCP bits are already clear */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/* incremental checksum update for the changed TOS byte */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* nothing to do if the DSCP bits are already clear */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		/* IPv6 has no header checksum; just rewrite the flow word */
		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
5336
5337 static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * p,boolean_t flush,boolean_t * pdrop)5338 ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
5339 classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
5340 {
5341 #if SKYWALK
5342 volatile struct sk_nexusadv *nxadv = NULL;
5343 #endif /* SKYWALK */
5344 volatile uint64_t *fg_ts = NULL;
5345 volatile uint64_t *rt_ts = NULL;
5346 struct timespec now;
5347 u_int64_t now_nsec = 0;
5348 int error = 0;
5349 uint8_t *mcast_buf = NULL;
5350 uint8_t ip_ver;
5351 uint32_t pktlen;
5352
5353 ASSERT(ifp->if_eflags & IFEF_TXSTART);
5354 #if SKYWALK
5355 /*
5356 * If attached to flowswitch, grab pointers to the
5357 * timestamp variables in the nexus advisory region.
5358 */
5359 if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
5360 (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
5361 fg_ts = &nxadv->nxadv_fg_sendts;
5362 rt_ts = &nxadv->nxadv_rt_sendts;
5363 }
5364 #endif /* SKYWALK */
5365
5366 /*
5367 * If packet already carries a timestamp, either from dlil_output()
5368 * or from flowswitch, use it here. Otherwise, record timestamp.
5369 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
5370 * the timestamp value is used internally there.
5371 */
5372 switch (p->cp_ptype) {
5373 case QP_MBUF:
5374 #if SKYWALK
5375 /*
5376 * Valid only for non-native (compat) Skywalk interface.
5377 * If the data source uses packet, caller must convert
5378 * it to mbuf first prior to calling this routine.
5379 */
5380 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5381 #endif /* SKYWALK */
5382 ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
5383 ASSERT(p->cp_mbuf->m_nextpkt == NULL);
5384
5385 if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
5386 p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
5387 nanouptime(&now);
5388 net_timernsec(&now, &now_nsec);
5389 p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
5390 }
5391 p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
5392 /*
5393 * If the packet service class is not background,
5394 * update the timestamp to indicate recent activity
5395 * on a foreground socket.
5396 */
5397 if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
5398 p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
5399 if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
5400 PKTF_SO_BACKGROUND)) {
5401 ifp->if_fg_sendts = (uint32_t)_net_uptime;
5402 if (fg_ts != NULL) {
5403 *fg_ts = (uint32_t)_net_uptime;
5404 }
5405 }
5406 if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
5407 ifp->if_rt_sendts = (uint32_t)_net_uptime;
5408 if (rt_ts != NULL) {
5409 *rt_ts = (uint32_t)_net_uptime;
5410 }
5411 }
5412 }
5413 pktlen = m_pktlen(p->cp_mbuf);
5414
5415 /*
5416 * Some Wi-Fi AP implementations do not correctly handle
5417 * multicast IP packets with DSCP bits set (radr://9331522).
5418 * As a workaround we clear the DSCP bits but keep service
5419 * class (rdar://51507725).
5420 */
5421 if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
5422 IFNET_IS_WIFI_INFRA(ifp)) {
5423 size_t len = mbuf_len(p->cp_mbuf), hlen;
5424 struct ether_header *eh;
5425 boolean_t pullup = FALSE;
5426 uint16_t etype;
5427
5428 if (__improbable(len < sizeof(struct ether_header))) {
5429 DTRACE_IP1(small__ether, size_t, len);
5430 if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
5431 sizeof(struct ether_header))) == NULL) {
5432 return ENOMEM;
5433 }
5434 }
5435 eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
5436 etype = ntohs(eh->ether_type);
5437 if (etype == ETHERTYPE_IP) {
5438 hlen = sizeof(struct ether_header) +
5439 sizeof(struct ip);
5440 if (len < hlen) {
5441 DTRACE_IP1(small__v4, size_t, len);
5442 pullup = TRUE;
5443 }
5444 ip_ver = IPVERSION;
5445 } else if (etype == ETHERTYPE_IPV6) {
5446 hlen = sizeof(struct ether_header) +
5447 sizeof(struct ip6_hdr);
5448 if (len < hlen) {
5449 DTRACE_IP1(small__v6, size_t, len);
5450 pullup = TRUE;
5451 }
5452 ip_ver = IPV6_VERSION;
5453 } else {
5454 DTRACE_IP1(invalid__etype, uint16_t, etype);
5455 break;
5456 }
5457 if (pullup) {
5458 if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
5459 NULL) {
5460 return ENOMEM;
5461 }
5462
5463 eh = (struct ether_header *)mbuf_data(
5464 p->cp_mbuf);
5465 }
5466 mcast_buf = (uint8_t *)(eh + 1);
5467 /*
5468 * ifnet_mcast_clear_dscp() will finish the work below.
5469 * Note that the pullups above ensure that mcast_buf
5470 * points to a full IP header.
5471 */
5472 }
5473 break;
5474
5475 #if SKYWALK
5476 case QP_PACKET:
5477 /*
5478 * Valid only for native Skywalk interface. If the data
5479 * source uses mbuf, caller must convert it to packet first
5480 * prior to calling this routine.
5481 */
5482 ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
5483 if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
5484 p->cp_kpkt->pkt_timestamp == 0) {
5485 nanouptime(&now);
5486 net_timernsec(&now, &now_nsec);
5487 p->cp_kpkt->pkt_timestamp = now_nsec;
5488 }
5489 p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
5490 /*
5491 * If the packet service class is not background,
5492 * update the timestamps on the interface, as well as
5493 * the ones in nexus-wide advisory to indicate recent
5494 * activity on a foreground flow.
5495 */
5496 if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
5497 ifp->if_fg_sendts = (uint32_t)_net_uptime;
5498 if (fg_ts != NULL) {
5499 *fg_ts = (uint32_t)_net_uptime;
5500 }
5501 }
5502 if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
5503 ifp->if_rt_sendts = (uint32_t)_net_uptime;
5504 if (rt_ts != NULL) {
5505 *rt_ts = (uint32_t)_net_uptime;
5506 }
5507 }
5508 pktlen = p->cp_kpkt->pkt_length;
5509
5510 /*
5511 * Some Wi-Fi AP implementations do not correctly handle
5512 * multicast IP packets with DSCP bits set (radr://9331522).
5513 * As a workaround we clear the DSCP bits but keep service
5514 * class (rdar://51507725).
5515 */
5516 if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
5517 IFNET_IS_WIFI_INFRA(ifp)) {
5518 uint8_t *baddr;
5519 struct ether_header *eh;
5520 uint16_t etype;
5521
5522 MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
5523 baddr += p->cp_kpkt->pkt_headroom;
5524 if (__improbable(pktlen < sizeof(struct ether_header))) {
5525 DTRACE_IP1(pkt__small__ether, __kern_packet *,
5526 p->cp_kpkt);
5527 break;
5528 }
5529 eh = (struct ether_header *)(void *)baddr;
5530 etype = ntohs(eh->ether_type);
5531 if (etype == ETHERTYPE_IP) {
5532 if (pktlen < sizeof(struct ether_header) +
5533 sizeof(struct ip)) {
5534 DTRACE_IP1(pkt__small__v4, uint32_t,
5535 pktlen);
5536 break;
5537 }
5538 ip_ver = IPVERSION;
5539 } else if (etype == ETHERTYPE_IPV6) {
5540 if (pktlen < sizeof(struct ether_header) +
5541 sizeof(struct ip6_hdr)) {
5542 DTRACE_IP1(pkt__small__v6, uint32_t,
5543 pktlen);
5544 break;
5545 }
5546 ip_ver = IPV6_VERSION;
5547 } else {
5548 DTRACE_IP1(pkt__invalid__etype, uint16_t,
5549 etype);
5550 break;
5551 }
5552 mcast_buf = (uint8_t *)(eh + 1);
5553 /*
5554 * ifnet_mcast_clear_dscp() will finish the work below.
5555 * The checks above verify that the IP header is in the
5556 * first buflet.
5557 */
5558 }
5559 break;
5560 #endif /* SKYWALK */
5561
5562 default:
5563 VERIFY(0);
5564 /* NOTREACHED */
5565 __builtin_unreachable();
5566 }
5567
5568 if (mcast_buf != NULL) {
5569 ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
5570 }
5571
5572 if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
5573 if (now_nsec == 0) {
5574 nanouptime(&now);
5575 net_timernsec(&now, &now_nsec);
5576 }
5577 /*
5578 * If the driver chose to delay start callback for
5579 * coalescing multiple packets, Then use the following
5580 * heuristics to make sure that start callback will
5581 * be delayed only when bulk data transfer is detected.
5582 * 1. number of packets enqueued in (delay_win * 2) is
5583 * greater than or equal to the delay qlen.
5584 * 2. If delay_start is enabled it will stay enabled for
5585 * another 10 idle windows. This is to take into account
5586 * variable RTT and burst traffic.
5587 * 3. If the time elapsed since last enqueue is more
5588 * than 200ms we disable delaying start callback. This is
5589 * is to take idle time into account.
5590 */
5591 u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
5592 if (ifp->if_start_delay_swin > 0) {
5593 if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
5594 ifp->if_start_delay_cnt++;
5595 } else if ((now_nsec - ifp->if_start_delay_swin)
5596 >= (200 * 1000 * 1000)) {
5597 ifp->if_start_delay_swin = now_nsec;
5598 ifp->if_start_delay_cnt = 1;
5599 ifp->if_start_delay_idle = 0;
5600 if (ifp->if_eflags & IFEF_DELAY_START) {
5601 if_clear_eflags(ifp, IFEF_DELAY_START);
5602 ifnet_delay_start_disabled_increment();
5603 }
5604 } else {
5605 if (ifp->if_start_delay_cnt >=
5606 ifp->if_start_delay_qlen) {
5607 if_set_eflags(ifp, IFEF_DELAY_START);
5608 ifp->if_start_delay_idle = 0;
5609 } else {
5610 if (ifp->if_start_delay_idle >= 10) {
5611 if_clear_eflags(ifp,
5612 IFEF_DELAY_START);
5613 ifnet_delay_start_disabled_increment();
5614 } else {
5615 ifp->if_start_delay_idle++;
5616 }
5617 }
5618 ifp->if_start_delay_swin = now_nsec;
5619 ifp->if_start_delay_cnt = 1;
5620 }
5621 } else {
5622 ifp->if_start_delay_swin = now_nsec;
5623 ifp->if_start_delay_cnt = 1;
5624 ifp->if_start_delay_idle = 0;
5625 if_clear_eflags(ifp, IFEF_DELAY_START);
5626 }
5627 } else {
5628 if_clear_eflags(ifp, IFEF_DELAY_START);
5629 }
5630
5631 /* enqueue the packet (caller consumes object) */
5632 error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
5633 1, pktlen, pdrop);
5634
5635 /*
5636 * Tell the driver to start dequeueing; do this even when the queue
5637 * for the packet is suspended (EQSUSPENDED), as the driver could still
5638 * be dequeueing from other unsuspended queues.
5639 */
5640 if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
5641 ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
5642 ifnet_start(ifp);
5643 }
5644
5645 return error;
5646 }
5647
5648 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5649 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5650 classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5651 boolean_t flush, boolean_t *pdrop)
5652 {
5653 int error;
5654
5655 /* enqueue the packet (caller consumes object) */
5656 error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5657 cnt, bytes, pdrop);
5658
5659 /*
5660 * Tell the driver to start dequeueing; do this even when the queue
5661 * for the packet is suspended (EQSUSPENDED), as the driver could still
5662 * be dequeueing from other unsuspended queues.
5663 */
5664 if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5665 ifnet_start(ifp);
5666 }
5667 return error;
5668 }
5669
5670 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5671 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5672 {
5673 struct ifnet *ifp = handle;
5674 boolean_t pdrop; /* dummy */
5675 uint32_t i;
5676
5677 ASSERT(n_pkts >= 1);
5678 for (i = 0; i < n_pkts - 1; i++) {
5679 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5680 FALSE, &pdrop);
5681 }
5682 /* flush with the last packet */
5683 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5684 TRUE, &pdrop);
5685
5686 return 0;
5687 }
5688
5689 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5690 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5691 classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5692 {
5693 if (ifp->if_output_netem != NULL) {
5694 bool drop;
5695 errno_t error;
5696 error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5697 *pdrop = drop ? TRUE : FALSE;
5698 return error;
5699 } else {
5700 return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5701 }
5702 }
5703
5704 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5705 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5706 {
5707 uint32_t bytes = m_pktlen(m);
5708 struct mbuf *tail = m;
5709 uint32_t cnt = 1;
5710 boolean_t pdrop;
5711
5712 while (tail->m_nextpkt) {
5713 VERIFY(tail->m_flags & M_PKTHDR);
5714 tail = tail->m_nextpkt;
5715 cnt++;
5716 bytes += m_pktlen(tail);
5717 }
5718
5719 return ifnet_enqueue_mbuf_chain(ifp, m, tail, cnt, bytes, TRUE, &pdrop);
5720 }
5721
/*
 * Validate and enqueue a single mbuf packet on ifp's send queue.  On
 * success the mbuf is consumed; on a validation failure it is freed and
 * *pdrop is set to TRUE.
 *
 * NOTE(review): when m is NULL, EINVAL is returned without writing
 * *pdrop — presumably callers do not read it on that path; confirm.
 */
errno_t
ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
    boolean_t *pdrop)
{
	classq_pkt_t pkt;

	/* single packet only: reject chains (m_nextpkt must be NULL) */
	if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
	    m->m_nextpkt != NULL) {
		if (m != NULL) {
			m_freem_list(m);
			*pdrop = TRUE;
		}
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    !IF_FULLY_ATTACHED(ifp)) {
		/* flag tested without lock for performance */
		m_freem(m);
		*pdrop = TRUE;
		return ENXIO;
	} else if (!(ifp->if_flags & IFF_UP)) {
		m_freem(m);
		*pdrop = TRUE;
		return ENETDOWN;
	}

	CLASSQ_PKT_INIT_MBUF(&pkt, m);
	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
}
5750
5751 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5752 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5753 struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5754 boolean_t *pdrop)
5755 {
5756 classq_pkt_t head, tail;
5757
5758 ASSERT(m_head != NULL);
5759 ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5760 ASSERT(m_tail != NULL);
5761 ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5762 ASSERT(ifp != NULL);
5763 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5764
5765 if (!IF_FULLY_ATTACHED(ifp)) {
5766 /* flag tested without lock for performance */
5767 m_freem_list(m_head);
5768 *pdrop = TRUE;
5769 return ENXIO;
5770 } else if (!(ifp->if_flags & IFF_UP)) {
5771 m_freem_list(m_head);
5772 *pdrop = TRUE;
5773 return ENETDOWN;
5774 }
5775
5776 CLASSQ_PKT_INIT_MBUF(&head, m_head);
5777 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5778 return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5779 flush, pdrop);
5780 }
5781
5782 #if SKYWALK
/*
 * Validate and enqueue a single Skywalk packet.  On any validation
 * failure the packet is returned to its packet pool and *pdrop is set
 * to TRUE (except when kpkt itself is NULL, in which case *pdrop is
 * left untouched).
 */
static errno_t
ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
{
	classq_pkt_t pkt;

	/* single packet only: chains are not accepted here */
	ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);

	if (__improbable(ifp == NULL || kpkt == NULL)) {
		if (kpkt != NULL) {
			pp_free_packet(__DECONST(struct kern_pbufpool *,
			    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
			*pdrop = TRUE;
		}
		return EINVAL;
	} else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
	    !IF_FULLY_ATTACHED(ifp))) {
		/* flag tested without lock for performance */
		pp_free_packet(__DECONST(struct kern_pbufpool *,
		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
		*pdrop = TRUE;
		return ENXIO;
	} else if (__improbable(!(ifp->if_flags & IFF_UP))) {
		pp_free_packet(__DECONST(struct kern_pbufpool *,
		    kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
		*pdrop = TRUE;
		return ENETDOWN;
	}

	CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
}
5815
/*
 * Enqueue a single Skywalk packet on ifp's default send queue.
 */
errno_t
ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
    boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
}
5822
/*
 * Enqueue a single Skywalk packet on a caller-supplied class queue.
 */
errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
}
5829
5830 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5831 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5832 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5833 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5834 {
5835 classq_pkt_t head, tail;
5836
5837 ASSERT(k_head != NULL);
5838 ASSERT(k_tail != NULL);
5839 ASSERT(ifp != NULL);
5840 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5841
5842 if (!IF_FULLY_ATTACHED(ifp)) {
5843 /* flag tested without lock for performance */
5844 pp_free_packet_chain(k_head, NULL);
5845 *pdrop = TRUE;
5846 return ENXIO;
5847 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5848 pp_free_packet_chain(k_head, NULL);
5849 *pdrop = TRUE;
5850 return ENETDOWN;
5851 }
5852
5853 CLASSQ_PKT_INIT_PACKET(&head, k_head);
5854 CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5855 return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5856 flush, pdrop);
5857 }
5858
/*
 * Enqueue a chain of Skywalk packets on ifp's default send queue.
 */
errno_t
ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
    struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
    boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5867
/*
 * Enqueue a chain of Skywalk packets on a caller-supplied class queue.
 */
errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
    uint32_t bytes, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
	           cnt, bytes, flush, pdrop);
}
5876 #endif /* SKYWALK */
5877
5878 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5879 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5880 {
5881 errno_t rc;
5882 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5883
5884 if (ifp == NULL || mp == NULL) {
5885 return EINVAL;
5886 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5887 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5888 return ENXIO;
5889 }
5890 if (!ifnet_is_attached(ifp, 1)) {
5891 return ENXIO;
5892 }
5893
5894 #if SKYWALK
5895 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5896 #endif /* SKYWALK */
5897 rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5898 &pkt, NULL, NULL, NULL, 0);
5899 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5900 ifnet_decr_iorefcnt(ifp);
5901 *mp = pkt.cp_mbuf;
5902 return rc;
5903 }
5904
5905 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5906 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5907 struct mbuf **mp)
5908 {
5909 errno_t rc;
5910 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5911
5912 if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5913 return EINVAL;
5914 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5915 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5916 return ENXIO;
5917 }
5918 if (!ifnet_is_attached(ifp, 1)) {
5919 return ENXIO;
5920 }
5921
5922 #if SKYWALK
5923 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5924 #endif /* SKYWALK */
5925 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5926 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5927 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5928 ifnet_decr_iorefcnt(ifp);
5929 *mp = pkt.cp_mbuf;
5930 return rc;
5931 }
5932
5933 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5934 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5935 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5936 {
5937 errno_t rc;
5938 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5939 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5940
5941 if (ifp == NULL || head == NULL || pkt_limit < 1) {
5942 return EINVAL;
5943 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5944 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5945 return ENXIO;
5946 }
5947 if (!ifnet_is_attached(ifp, 1)) {
5948 return ENXIO;
5949 }
5950
5951 #if SKYWALK
5952 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5953 #endif /* SKYWALK */
5954 rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5955 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5956 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5957 ifnet_decr_iorefcnt(ifp);
5958 *head = pkt_head.cp_mbuf;
5959 if (tail != NULL) {
5960 *tail = pkt_tail.cp_mbuf;
5961 }
5962 return rc;
5963 }
5964
5965 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5966 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5967 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5968 {
5969 errno_t rc;
5970 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5971 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5972
5973 if (ifp == NULL || head == NULL || byte_limit < 1) {
5974 return EINVAL;
5975 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5976 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5977 return ENXIO;
5978 }
5979 if (!ifnet_is_attached(ifp, 1)) {
5980 return ENXIO;
5981 }
5982
5983 #if SKYWALK
5984 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5985 #endif /* SKYWALK */
5986 rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5987 byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5988 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5989 ifnet_decr_iorefcnt(ifp);
5990 *head = pkt_head.cp_mbuf;
5991 if (tail != NULL) {
5992 *tail = pkt_tail.cp_mbuf;
5993 }
5994 return rc;
5995 }
5996
5997 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5998 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5999 u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
6000 u_int32_t *len)
6001 {
6002 errno_t rc;
6003 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
6004 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
6005
6006 if (ifp == NULL || head == NULL || pkt_limit < 1 ||
6007 !MBUF_VALID_SC(sc)) {
6008 return EINVAL;
6009 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
6010 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
6011 return ENXIO;
6012 }
6013 if (!ifnet_is_attached(ifp, 1)) {
6014 return ENXIO;
6015 }
6016
6017 #if SKYWALK
6018 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
6019 #endif /* SKYWALK */
6020 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
6021 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
6022 cnt, len, 0);
6023 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
6024 ifnet_decr_iorefcnt(ifp);
6025 *head = pkt_head.cp_mbuf;
6026 if (tail != NULL) {
6027 *tail = pkt_tail.cp_mbuf;
6028 }
6029 return rc;
6030 }
6031
6032 #if XNU_TARGET_OS_OSX
6033 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)6034 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
6035 const struct sockaddr *dest, const char *dest_linkaddr,
6036 const char *frame_type, u_int32_t *pre, u_int32_t *post)
6037 {
6038 if (pre != NULL) {
6039 *pre = 0;
6040 }
6041 if (post != NULL) {
6042 *post = 0;
6043 }
6044
6045 return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
6046 }
6047 #endif /* XNU_TARGET_OS_OSX */
6048
6049 static boolean_t
packet_has_vlan_tag(struct mbuf * m)6050 packet_has_vlan_tag(struct mbuf * m)
6051 {
6052 u_int tag = 0;
6053
6054 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
6055 tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
6056 if (tag == 0) {
6057 /* the packet is just priority-tagged, clear the bit */
6058 m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
6059 }
6060 }
6061 return tag != 0;
6062 }
6063
/*
 * Run an inbound packet through the interface filter chain.  Returns 0
 * when the packet survived all filters (possibly modified via m_p and
 * frame_header_p); a non-zero result means a filter consumed or rejected
 * the packet and the caller must not touch it further (EJUSTRETURN means
 * it was swallowed without error).
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet = FALSE;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	/*
	 * NOTE: packet_has_vlan_tag() may clear CSUM_VLAN_TAG_VALID on
	 * priority-tagged packets as a side effect, so it is invoked even
	 * when the filter list turns out to be empty.
	 */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the filter callback */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
6124
/*
 * Run an outbound packet through the interface filter chain.  Returns 0
 * when the packet survived all filters (possibly modified via m_p);
 * a non-zero result means a filter consumed or rejected the packet and
 * the caller must not touch it further (EJUSTRETURN means it was
 * swallowed without error).
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the filter callback */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
6177
6178 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)6179 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
6180 {
6181 int error;
6182
6183 if (ifproto->proto_kpi == kProtoKPI_v1) {
6184 /* Version 1 protocols get one packet at a time */
6185 while (m != NULL) {
6186 char * frame_header;
6187 mbuf_t next_packet;
6188
6189 next_packet = m->m_nextpkt;
6190 m->m_nextpkt = NULL;
6191 frame_header = m->m_pkthdr.pkt_hdr;
6192 m->m_pkthdr.pkt_hdr = NULL;
6193 error = (*ifproto->kpi.v1.input)(ifproto->ifp,
6194 ifproto->protocol_family, m, frame_header);
6195 if (error != 0 && error != EJUSTRETURN) {
6196 m_freem(m);
6197 }
6198 m = next_packet;
6199 }
6200 } else if (ifproto->proto_kpi == kProtoKPI_v2) {
6201 /* Version 2 protocols support packet lists */
6202 error = (*ifproto->kpi.v2.input)(ifproto->ifp,
6203 ifproto->protocol_family, m);
6204 if (error != 0 && error != EJUSTRETURN) {
6205 m_freem_list(m);
6206 }
6207 }
6208 }
6209
6210 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)6211 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
6212 struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
6213 {
6214 struct ifnet_stat_increment_param *d = &inp->dlth_stats;
6215
6216 if (s->packets_in != 0) {
6217 d->packets_in += s->packets_in;
6218 }
6219 if (s->bytes_in != 0) {
6220 d->bytes_in += s->bytes_in;
6221 }
6222 if (s->errors_in != 0) {
6223 d->errors_in += s->errors_in;
6224 }
6225
6226 if (s->packets_out != 0) {
6227 d->packets_out += s->packets_out;
6228 }
6229 if (s->bytes_out != 0) {
6230 d->bytes_out += s->bytes_out;
6231 }
6232 if (s->errors_out != 0) {
6233 d->errors_out += s->errors_out;
6234 }
6235
6236 if (s->collisions != 0) {
6237 d->collisions += s->collisions;
6238 }
6239 if (s->dropped != 0) {
6240 d->dropped += s->dropped;
6241 }
6242
6243 if (poll) {
6244 PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
6245 }
6246 }
6247
/*
 * Flush the input thread's locally-accumulated statistics
 * (inp->dlth_stats) into the interface's global counters, and fold the
 * per-thread poll counters into ifp->if_poll_pstats.  Returns TRUE when
 * the interface has a non-zero data threshold configured, so the caller
 * can decide whether further notification is needed.
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
6307
6308 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6309 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6310 {
6311 return dlil_input_packet_list_common(ifp, m, 0,
6312 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6313 }
6314
6315 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6316 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6317 u_int32_t cnt, ifnet_model_t mode)
6318 {
6319 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6320 }
6321
/*
 * Common DLIL input path: walk a chain of received packets, demux each
 * one to its protocol family, run interface filters, perform CLAT46
 * translation where applicable, and batch consecutive packets destined
 * to the same protocol before handing them up via dlil_ifproto_input().
 *
 * ifp_param: receiving interface, or NULL to use each mbuf's rcvif.
 * m:         chain of received packets (linked via m_nextpkt).
 * cnt:       number of packets in the chain (meaningful when ext is TRUE).
 * mode:      input polling model reported by the caller.
 * ext:       TRUE when invoked via the extended (polling-aware) entry.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;
	mbuf_t *pkt_next = NULL;
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* arm opportunistic polling only for multi-packet batches */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		/* poll the interface every poll_ival packets (legacy rxpoll) */
		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach this packet from the chain before processing it */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		/* the wake-packet marking always survives reclassification */
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				goto next;
			}
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		/* optionally log a hexdump of the beginning of wake packets */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceeding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has
		 * been queried by the driver before the device went to sleep.
		 * Only non-IP families are matched here; IPv4/IPv6 wake
		 * packets are presumably matched later in their own input
		 * paths -- TODO confirm.
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				/* inconsistent offsets: invalidate the checksum */
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			os_atomic_inc(&ifp->if_imcasts, relaxed);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same protocol as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6654
6655 static errno_t
if_mcasts_update_common(struct ifnet * ifp,bool sync)6656 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6657 {
6658 errno_t err;
6659
6660 if (sync) {
6661 err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6662 if (err == EAFNOSUPPORT) {
6663 err = 0;
6664 }
6665 } else {
6666 ifnet_ioctl_async(ifp, SIOCADDMULTI);
6667 err = 0;
6668 }
6669 DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6670 "(err=%d)\n", if_name(ifp),
6671 (err == 0 ? "successfully restored" : "failed to restore"),
6672 ifp->if_updatemcasts, err);
6673
6674 /* just return success */
6675 return 0;
6676 }
6677
6678 static errno_t
if_mcasts_update_async(struct ifnet * ifp)6679 if_mcasts_update_async(struct ifnet *ifp)
6680 {
6681 return if_mcasts_update_common(ifp, false);
6682 }
6683
6684 errno_t
if_mcasts_update(struct ifnet * ifp)6685 if_mcasts_update(struct ifnet *ifp)
6686 {
6687 return if_mcasts_update_common(ifp, true);
6688 }
6689
/*
 * Post a kernel event message. When an interface is supplied, its
 * generation count is bumped first so observers can detect the change.
 */
int
dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
{
	if (ifp) {
		ifnet_increment_generation(ifp);
	}

#if NECP
	/* Let NECP clients re-evaluate interface state */
	necp_update_all_clients();
#endif /* NECP */

	return kev_post_msg(event);
}
6704
6705 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6706 dlil_post_sifflags_msg(struct ifnet * ifp)
6707 {
6708 struct kev_msg ev_msg;
6709 struct net_event_data ev_data;
6710
6711 bzero(&ev_data, sizeof(ev_data));
6712 bzero(&ev_msg, sizeof(ev_msg));
6713 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6714 ev_msg.kev_class = KEV_NETWORK_CLASS;
6715 ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6716 ev_msg.event_code = KEV_DL_SIFFLAGS;
6717 strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6718 ev_data.if_family = ifp->if_family;
6719 ev_data.if_unit = (u_int32_t) ifp->if_unit;
6720 ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6721 ev_msg.dv[0].data_ptr = &ev_data;
6722 ev_msg.dv[1].data_length = 0;
6723 dlil_post_complete_msg(ifp, &ev_msg);
6724 }
6725
#define TMP_IF_PROTO_ARR_SIZE 10
/*
 * Deliver a kernel event to everything associated with the interface:
 * the attached interface filters, every attached protocol's event
 * callback, and the interface's own if_event handler. Finally the
 * message is posted via dlil_post_complete_msg(); update_generation
 * selects whether the interface generation count is bumped as part of
 * that post.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	/* small on-stack array avoids a heap allocation in the common case */
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the mutex across the filter callback */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* take a refcnt on each protocol while still under the lock */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* invoke each protocol's event callback with the ifnet lock dropped */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6826
6827 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6828 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6829 {
6830 struct kev_msg kev_msg;
6831 int result = 0;
6832
6833 if (ifp == NULL || event == NULL) {
6834 return EINVAL;
6835 }
6836
6837 bzero(&kev_msg, sizeof(kev_msg));
6838 kev_msg.vendor_code = event->vendor_code;
6839 kev_msg.kev_class = event->kev_class;
6840 kev_msg.kev_subclass = event->kev_subclass;
6841 kev_msg.event_code = event->event_code;
6842 kev_msg.dv[0].data_ptr = &event->event_data[0];
6843 kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6844 kev_msg.dv[1].data_length = 0;
6845
6846 result = dlil_event_internal(ifp, &kev_msg, TRUE);
6847
6848 return result;
6849 }
6850
6851 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6852 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6853 {
6854 mbuf_t n = m;
6855 int chainlen = 0;
6856
6857 while (n != NULL) {
6858 chainlen++;
6859 n = n->m_next;
6860 }
6861 switch (chainlen) {
6862 case 0:
6863 break;
6864 case 1:
6865 os_atomic_inc(&cls->cls_one, relaxed);
6866 break;
6867 case 2:
6868 os_atomic_inc(&cls->cls_two, relaxed);
6869 break;
6870 case 3:
6871 os_atomic_inc(&cls->cls_three, relaxed);
6872 break;
6873 case 4:
6874 os_atomic_inc(&cls->cls_four, relaxed);
6875 break;
6876 case 5:
6877 default:
6878 os_atomic_inc(&cls->cls_five_or_more, relaxed);
6879 break;
6880 }
6881 }
6882
#if CONFIG_DTRACE
/*
 * Fire the DTrace ip send probe for an outbound IPv4 or IPv6 packet.
 * Kept out of line so probe argument setup stays off the dlil_output()
 * fast path. Other protocol families are ignored.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
{
	switch (proto_family) {
	case PF_INET: {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
		break;
	}
	case PF_INET6: {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
		break;
	}
	default:
		break;
	}
}
#endif /* CONFIG_DTRACE */
6901
/*
 * dlil_output
 *
 * Transmit a chain of packets (linked through m_nextpkt) on an
 * interface on behalf of a protocol family.
 *
 * Caller should have a lock on the protocol domain if the protocol
 * doesn't support finer grained locking. In most cases, the lock
 * will be held from the socket layer and won't be released until
 * we return back to the socket layer.
 *
 * This does mean that we must take a protocol lock before we take
 * an interface lock if we're going to take both. This makes sense
 * because a protocol is likely to interact with an ifp while it
 * is under the protocol lock.
 *
 * An advisory code will be returned if adv is not null. This
 * can be used to provide feedback about interface queues to the
 * application.
 *
 * Parameters:
 *	ifp		interface to transmit on
 *	proto_family	protocol family of the packets (e.g. PF_INET)
 *	packetlist	chain of packets linked via m_nextpkt; any packets
 *			left over on error are freed here
 *	route		optional route, handed to the protocol pre-output
 *	dest		destination address used for link-layer resolution
 *	raw		when non-zero, packets are already framed and the
 *			protocol pre-output and framer steps are skipped
 *	adv		optional flow advisory, updated when the driver
 *			reports EQFULL/EQSUSPENDED
 */
errno_t
dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
    void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
{
	char *frame_type = NULL;
	char *dst_linkaddr = NULL;
	int retval = 0;
	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
	struct if_proto *proto = NULL;
	mbuf_t m = NULL;
	mbuf_t send_head = NULL;
	mbuf_t *send_tail = &send_head;
	int iorefcnt = 0;
	u_int32_t pre = 0, post = 0;
	u_int32_t fpkts = 0, fbytes = 0;
	int32_t flen = 0;
	struct timespec now;
	u_int64_t now_nsec;
	boolean_t did_clat46 = FALSE;
	protocol_family_t old_proto_family = proto_family;
	struct sockaddr_in6 dest6;
	struct rtentry *rt = NULL;
	u_int32_t m_loop_set = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
	 * from happening while this operation is in progress
	 */
	if (!ifnet_datamov_begin(ifp)) {
		retval = ENXIO;
		goto cleanup;
	}
	iorefcnt = 1;

	VERIFY(ifp->if_output_dlil != NULL);

	/* update the driver's multicast filter, if needed */
	if (ifp->if_updatemcasts > 0) {
		if_mcasts_update_async(ifp);
		ifp->if_updatemcasts = 0;
	}

	frame_type = frame_type_buffer;
	dst_linkaddr = dst_linkaddr_buffer;

	if (raw == 0) {
		ifnet_lock_shared(ifp);
		/* callee holds a proto refcnt upon success */
		proto = find_attached_proto(ifp, proto_family);
		if (proto == NULL) {
			ifnet_lock_done(ifp);
			retval = ENXIO;
			goto cleanup;
		}
		ifnet_lock_done(ifp);
	}

preout_again:
	if (packetlist == NULL) {
		goto cleanup;
	}

	/* detach the head packet from the chain */
	m = packetlist;
	packetlist = packetlist->m_nextpkt;
	m->m_nextpkt = NULL;

	/* record that the packet passed through dlil_output */
	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);

	/*
	 * Perform address family translation for the first
	 * packet outside the loop in order to perform address
	 * lookup for the translated proto family.
	 */
	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
	    (ifp->if_type == IFT_CELLULAR ||
	    dlil_is_clat_needed(proto_family, m))) {
		retval = dlil_clat46(ifp, &proto_family, &m);
		/*
		 * Go to the next packet if translation fails
		 */
		if (retval != 0) {
			m_freem(m);
			m = NULL;
			ip6stat.ip6s_clat464_out_drop++;
			/* Make sure that the proto family is PF_INET */
			ASSERT(proto_family == PF_INET);
			goto preout_again;
		}
		/*
		 * Free the old one and make it point to the IPv6 proto structure.
		 *
		 * Change proto for the first time we have successfully
		 * performed address family translation.
		 */
		if (!did_clat46 && proto_family == PF_INET6) {
			did_clat46 = TRUE;

			if (proto != NULL) {
				if_proto_free(proto);
			}
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			proto = find_attached_proto(ifp, proto_family);
			if (proto == NULL) {
				ifnet_lock_done(ifp);
				retval = ENXIO;
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
			ifnet_lock_done(ifp);
			if (ifp->if_type == IFT_ETHER) {
				/* Update the dest to translated v6 address */
				dest6.sin6_len = sizeof(struct sockaddr_in6);
				dest6.sin6_family = AF_INET6;
				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
				dest = (const struct sockaddr *)&dest6;

				/*
				 * Lookup route to the translated destination
				 * Free this route ref during cleanup
				 */
				rt = rtalloc1_scoped((struct sockaddr *)&dest6,
				    0, 0, ifp->if_index);

				route = rt;
			}
		}
	}

	/*
	 * This path gets packet chain going to the same destination.
	 * The pre output routine is used to either trigger resolution of
	 * the next hop or retreive the next hop's link layer addressing.
	 * For ex: ether_inet(6)_pre_output routine.
	 *
	 * If the routine returns EJUSTRETURN, it implies that packet has
	 * been queued, and therefore we have to call preout_again for the
	 * following packet in the chain.
	 *
	 * For errors other than EJUSTRETURN, the current packet is freed
	 * and the rest of the chain (pointed by packetlist is freed as
	 * part of clean up.
	 *
	 * Else if there is no error the retrieved information is used for
	 * all the packets in the chain.
	 */
	if (raw == 0) {
		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
		retval = 0;
		if (preoutp != NULL) {
			retval = preoutp(ifp, proto_family, &m, dest, route,
			    frame_type, dst_linkaddr);

			if (retval != 0) {
				if (retval == EJUSTRETURN) {
					goto preout_again;
				}
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
		}
	}

	do {
		/*
		 * pkt_hdr is set here to point to m_data prior to
		 * calling into the framer. This value of pkt_hdr is
		 * used by the netif gso logic to retrieve the ip header
		 * for the TCP packets, offloaded for TSO processing.
		 */
		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
			uint8_t vlan_encap_len = 0;

			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
			}
			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
		} else {
			m->m_pkthdr.pkt_hdr = mtod(m, void *);
		}

		/*
		 * Perform address family translation if needed.
		 * For now we only support stateless 4 to 6 translation
		 * on the out path.
		 *
		 * The routine below translates IP header, updates protocol
		 * checksum and also translates ICMP.
		 *
		 * We skip the first packet as it is already translated and
		 * the proto family is set to PF_INET6.
		 */
		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    (ifp->if_type == IFT_CELLULAR ||
		    dlil_is_clat_needed(proto_family, m))) {
			retval = dlil_clat46(ifp, &proto_family, &m);
			/* Goto the next packet if the translation fails */
			if (retval != 0) {
				m_freem(m);
				m = NULL;
				ip6stat.ip6s_clat464_out_drop++;
				goto next;
			}
		}

#if CONFIG_DTRACE
		if (!raw) {
			dlil_output_dtrace(ifp, proto_family, m);
		}
#endif /* CONFIG_DTRACE */

		if (raw == 0 && ifp->if_framer != NULL) {
			int rcvif_set = 0;

			/*
			 * If this is a broadcast packet that needs to be
			 * looped back into the system, set the inbound ifp
			 * to that of the outbound ifp. This will allow
			 * us to determine that it is a legitimate packet
			 * for the system. Only set the ifp if it's not
			 * already set, just to be safe.
			 */
			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
			    m->m_pkthdr.rcvif == NULL) {
				m->m_pkthdr.rcvif = ifp;
				rcvif_set = 1;
			}
			/* remember M_LOOP; it is re-applied at the 'next' label */
			m_loop_set = m->m_flags & M_LOOP;
			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
			    frame_type, &pre, &post);
			if (retval != 0) {
				if (retval != EJUSTRETURN) {
					m_freem(m);
				}
				goto next;
			}

			/*
			 * For partial checksum offload, adjust the start
			 * and stuff offsets based on the prepended header.
			 */
			if ((m->m_pkthdr.csum_flags &
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
				m->m_pkthdr.csum_tx_stuff += pre;
				m->m_pkthdr.csum_tx_start += pre;
			}

			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
				dlil_output_cksum_dbg(ifp, m, pre,
				    proto_family);
			}

			/*
			 * Clear the ifp if it was set above, and to be
			 * safe, only if it is still the same as the
			 * outbound ifp we have in context. If it was
			 * looped back, then a copy of it was sent to the
			 * loopback interface with the rcvif set, and we
			 * are clearing the one that will go down to the
			 * layer below.
			 */
			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
				m->m_pkthdr.rcvif = NULL;
			}
		}

		/*
		 * Let interface filters (if any) do their thing ...
		 */
		retval = dlil_interface_filters_output(ifp, &m, proto_family);
		if (retval != 0) {
			if (retval != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * Strip away M_PROTO1 bit prior to sending packet
		 * to the driver as this field may be used by the driver
		 */
		m->m_flags &= ~M_PROTO1;

		/*
		 * If the underlying interface is not capable of handling a
		 * packet whose data portion spans across physically disjoint
		 * pages, we need to "normalize" the packet so that we pass
		 * down a chain of mbufs where each mbuf points to a span that
		 * resides in the system page boundary. If the packet does
		 * not cross page(s), the following is a no-op.
		 */
		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
			if ((m = m_normalize(m)) == NULL) {
				goto next;
			}
		}

		/*
		 * If this is a TSO packet, make sure the interface still
		 * advertise TSO capability.
		 */
		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
			retval = EMSGSIZE;
			m_freem(m);
			goto cleanup;
		}

		ifp_inc_traffic_class_out(ifp, m);

#if SKYWALK
		/*
		 * For native skywalk devices, packets will be passed to pktap
		 * after GSO or after the mbuf to packet conversion.
		 * This is done for IPv4/IPv6 packets only because there is no
		 * space in the mbuf to pass down the proto family.
		 */
		if (dlil_is_native_netif_nexus(ifp)) {
			if (raw || m->m_pkthdr.pkt_proto == 0) {
				pktap_output(ifp, proto_family, m, pre, post);
				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
			}
		} else {
			pktap_output(ifp, proto_family, m, pre, post);
		}
#else /* SKYWALK */
		pktap_output(ifp, proto_family, m, pre, post);
#endif /* SKYWALK */

		/*
		 * Count the number of elements in the mbuf chain
		 */
		if (tx_chain_len_count) {
			dlil_count_chain_len(m, &tx_chain_len_stats);
		}

		/*
		 * Record timestamp; ifnet_enqueue() will use this info
		 * rather than redoing the work. An optimization could
		 * involve doing this just once at the top, if there are
		 * no interface filters attached, but that's probably
		 * not a big deal.
		 */
		nanouptime(&now);
		net_timernsec(&now, &now_nsec);
		(void) mbuf_set_timestamp(m, now_nsec, TRUE);

		/*
		 * Discard partial sum information if this packet originated
		 * from another interface; the packet would already have the
		 * final checksum and we shouldn't recompute it.
		 */
		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
			m->m_pkthdr.csum_data = 0;
		}

		/*
		 * Finally, call the driver.
		 */
		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
			/* batch: accumulate onto send_head, flush after loop */
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen += (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			}
			*send_tail = m;
			send_tail = &m->m_nextpkt;
		} else {
			/* immediate: hand this single packet to the driver */
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen = (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			} else {
				flen = 0;
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
			    0, 0, 0, 0, 0);
			retval = (*ifp->if_output_dlil)(ifp, m);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				/* queue state feedback for the caller */
				if (adv != NULL && adv->code == FADV_SUCCESS) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp),
				    retval);
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
			    0, 0, 0, 0, 0);
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

next:
		m = packetlist;
		if (m != NULL) {
			/* re-apply the M_LOOP flag saved before framing */
			m->m_flags |= m_loop_set;
			packetlist = packetlist->m_nextpkt;
			m->m_nextpkt = NULL;
		}
		/* Reset the proto family to old proto family for CLAT */
		if (did_clat46) {
			proto_family = old_proto_family;
		}
	} while (m != NULL);

	if (send_head != NULL) {
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
		    0, 0, 0, 0, 0);
		if (ifp->if_eflags & IFEF_SENDLIST) {
			/* driver accepts a whole chain in one call */
			retval = (*ifp->if_output_dlil)(ifp, send_head);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				if (adv != NULL) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp), retval);
			}
		} else {
			/* enqueue one-by-one, then kick the starter thread */
			struct mbuf *send_m;
			int enq_cnt = 0;
			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
			while (send_head != NULL) {
				send_m = send_head;
				send_head = send_m->m_nextpkt;
				send_m->m_nextpkt = NULL;
				retval = (*ifp->if_output_dlil)(ifp, send_m);
				if (retval == EQFULL || retval == EQSUSPENDED) {
					if (adv != NULL) {
						adv->code = (retval == EQFULL ?
						    FADV_FLOW_CONTROLLED :
						    FADV_SUSPENDED);
					}
					retval = 0;
				}
				if (retval == 0) {
					enq_cnt++;
					if (flen > 0) {
						fpkts++;
					}
				}
				if (retval != 0 && dlil_verbose) {
					DLIL_PRINTF("%s: output error on %s "
					    "retval = %d\n",
					    __func__, if_name(ifp), retval);
				}
			}
			if (enq_cnt > 0) {
				fbytes += flen;
				ifnet_start(ifp);
			}
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

cleanup:
	/* fold forwarded-traffic counters into the interface stats */
	if (fbytes > 0) {
		ifp->if_fbytes += fbytes;
	}
	if (fpkts > 0) {
		ifp->if_fpackets += fpkts;
	}
	if (proto != NULL) {
		if_proto_free(proto);
	}
	if (packetlist) { /* if any packets are left, clean up */
		mbuf_freem_list(packetlist);
	}
	if (retval == EJUSTRETURN) {
		retval = 0;
	}
	if (iorefcnt == 1) {
		ifnet_datamov_end(ifp);
	}
	if (rt != NULL) {
		rtfree(rt);
		rt = NULL;
	}

	return retval;
}
7424
7425 /*
7426 * This routine checks if the destination address is not a loopback, link-local,
7427 * multicast or broadcast address.
7428 */
7429 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7430 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7431 {
7432 int ret = 0;
7433 switch (proto_family) {
7434 case PF_INET: {
7435 struct ip *iph = mtod(m, struct ip *);
7436 if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7437 ret = 1;
7438 }
7439 break;
7440 }
7441 case PF_INET6: {
7442 struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7443 if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7444 CLAT64_NEEDED(&ip6h->ip6_dst)) {
7445 ret = 1;
7446 }
7447 break;
7448 }
7449 }
7450
7451 return ret;
7452 }
/*
 * @brief This routine translates IPv4 packet to IPv6 packet,
 * updates protocol checksum and also translates ICMP for code
 * along with inner header translation.
 *
 * @param ifp Pointer to the interface
 * @param proto_family pointer to protocol family. It is updated if function
 * performs the translation successfully.
 * @param m Pointer to the pointer pointing to the packet. Needed because this
 * routine can end up changing the mbuf to a different one.
 *
 * @return 0 on success or else a negative value.
 */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_src = NULL;
	struct in6_addr *src = NULL;
	struct in6_addr dst;
	int error = 0;
	uint16_t off = 0;
	uint16_t tot_len = 0;
	uint16_t ip_id_val = 0;
	uint16_t ip_frag_off = 0;

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/* Wrap the mbuf in a pbuf for the nat464 translation routines */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* Capture the original IPv4 header fields before translation */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);	/* IPv4 header length in bytes */
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
	 * translation.
	 */
	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}

	src = &ia6_clat_src->ia_addr.sin6_addr;

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL; /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	if (ia6_clat_src != NULL) {
		IFA_REMREF(&ia6_clat_src->ia_ifa);
	}

	/*
	 * Hand the (possibly replaced) mbuf back to the caller and tear
	 * down the pbuf wrapper; an invalid pbuf means the packet is gone.
	 */
	if (pbuf_is_valid(pbuf)) {
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		error = -1;
		*m = NULL;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	/* Only flip the protocol family once translation fully succeeded */
	if (error == 0) {
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7599
/*
 * @brief This routine translates incoming IPv6 to IPv4 packet,
 * updates protocol checksum and also translates ICMPv6 outer
 * and inner headers
 *
 * @param ifp Pointer to the interface
 * @param proto_family pointer to protocol family. It is updated to PF_INET
 * if the function performs the translation successfully.
 * @param m Pointer to the pointer pointing to the packet. Needed because this
 * routine can end up changing the mbuf to a different one.
 *
 * @return 0 on success or else a negative value.
 */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_dst = NULL;
	struct in_ifaddr *ia4_clat_dst = NULL;
	struct in_addr *dst = NULL;
	struct in_addr src;
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	/* Remember the original addresses for protocol translation */
	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		/* Wrap the mbuf in a pbuf for the nat464 routines */
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrieve the local CLAT46 IPv4 address reserved for stateless
		 * translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			IFA_REMREF(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		IFA_REMREF(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		/* traffic class occupies bits 20-27 of the flow word */
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		if (ia4_clat_dst != NULL) {
			IFA_REMREF(&ia4_clat_dst->ia_ifa);
		}

		/* Hand the (possibly replaced) mbuf back to the caller */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		/* Only flip the protocol family on full success */
		if (error == 0) {
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7741
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/*
 * Argument carried by a deferred ioctl work item: the target interface
 * (the enqueuer holds an IO refcount on it) and the ioctl to re-issue.
 */
struct ifnet_ioctl_event {
	struct ifnet *ifp;
	u_long ioctl_code;
};

/*
 * Work queue entry wrapper; the embedded nwk_wqe is what gets enqueued
 * and the callback recovers this struct via __container_of().
 */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7754
7755 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7756 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7757 {
7758 struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7759 bool compare_expected;
7760
7761 /*
7762 * Get an io ref count if the interface is attached.
7763 * At this point it most likely is. We are taking a reference for
7764 * deferred processing.
7765 */
7766 if (!ifnet_is_attached(ifp, 1)) {
7767 os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7768 "is not attached",
7769 __func__, __LINE__, if_name(ifp), ioctl_code);
7770 return;
7771 }
7772 switch (ioctl_code) {
7773 case SIOCADDMULTI:
7774 compare_expected = false;
7775 if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
7776 ifnet_decr_iorefcnt(ifp);
7777 return;
7778 }
7779 break;
7780 case SIOCDELMULTI:
7781 compare_expected = false;
7782 if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
7783 ifnet_decr_iorefcnt(ifp);
7784 return;
7785 }
7786 break;
7787 default:
7788 os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
7789 __func__, __LINE__, if_name(ifp), ioctl_code);
7790 return;
7791 }
7792
7793 p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7794 Z_WAITOK | Z_ZERO | Z_NOFAIL);
7795
7796 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7797 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7798 p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7799 nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7800 }
7801
7802 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7803 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7804 {
7805 struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7806 struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7807
7808 struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7809 u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7810 int ret = 0;
7811
7812 switch (ioctl_code) {
7813 case SIOCADDMULTI:
7814 atomic_store(&ifp->if_mcast_add_signaled, false);
7815 break;
7816 case SIOCDELMULTI:
7817 atomic_store(&ifp->if_mcast_del_signaled, false);
7818 break;
7819 }
7820 if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7821 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7822 __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7823 } else if (dlil_verbose) {
7824 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7825 "for ioctl %lu",
7826 __func__, __LINE__, if_name(ifp), ioctl_code);
7827 }
7828 ifnet_decr_iorefcnt(ifp);
7829 kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7830 return;
7831 }
7832
/*
 * Issue an ioctl on an interface.  The request is offered, in order, to:
 * (1) every attached interface filter whose protocol matches proto_fam
 * (or is a wildcard), (2) the protocol attached for proto_fam, if any,
 * and (3) the interface's own if_ioctl handler.  The first result other
 * than 0/EOPNOTSUPP stops processing; EJUSTRETURN from any stage also
 * stops processing and is mapped to 0 on return.  Returns EINVAL for
 * bad arguments and EOPNOTSUPP if the interface is not attached or no
 * one handled the ioctl.  Holds an IO refcount on ifp for the duration.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;	/* "nobody handled it" until proven otherwise */
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock across the callout; monitor_busy pins the list */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled, stop processing" -- report success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7950
7951 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7952 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7953 {
7954 errno_t error = 0;
7955
7956 if (ifp->if_set_bpf_tap) {
7957 /* Get an io reference on the interface if it is attached */
7958 if (!ifnet_is_attached(ifp, 1)) {
7959 return ENXIO;
7960 }
7961 error = ifp->if_set_bpf_tap(ifp, mode, callback);
7962 ifnet_decr_iorefcnt(ifp);
7963 }
7964 return error;
7965 }
7966
/*
 * Resolve a multicast protocol address to a link-layer address.
 * The attached protocol's resolve_multi callback fills in ll_addr;
 * afterwards the interface's if_check_multi (if present) is asked to
 * verify either the resolved link-layer address (on success) or the
 * raw protocol address (if the protocol returned EOPNOTSUPP).
 * Returns EOPNOTSUPP if the interface is detached or nobody resolved.
 */
errno_t
dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
    struct sockaddr *ll_addr, size_t ll_len)
{
	errno_t result = EOPNOTSUPP;
	struct if_proto *proto;
	const struct sockaddr *verify;
	proto_media_resolve_multi resolvep;

	/* take an IO refcount; dropped before return */
	if (!ifnet_is_attached(ifp, 1)) {
		return result;
	}

	bzero(ll_addr, ll_len);

	/* Call the protocol first; callee holds a proto refcnt upon success */
	ifnet_lock_shared(ifp);
	proto = find_attached_proto(ifp, proto_addr->sa_family);
	ifnet_lock_done(ifp);
	if (proto != NULL) {
		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
		if (resolvep != NULL) {
			result = resolvep(ifp, proto_addr,
			    (struct sockaddr_dl *)(void *)ll_addr, ll_len);
		}
		if_proto_free(proto);
	}

	/* Let the interface verify the multicast address */
	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
		if (result == 0) {
			verify = ll_addr;
		} else {
			verify = proto_addr;
		}
		result = ifp->if_check_multi(ifp, verify);
	}

	ifnet_decr_iorefcnt(ifp);
	return result;
}
8009
/*
 * Send an ARP packet on a single interface by dispatching to the
 * attached protocol's send_arp callback (selected by the target
 * protocol address family).  Updates arpstat tx counters for requests
 * (including unicast requests when target_hw is supplied) and replies.
 * Returns ENOTSUP if ARP is disabled on the interface (IFF_NOARP), no
 * matching protocol is attached, or the protocol has no send_arp.
 */
__private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
	struct if_proto *proto;
	errno_t result = 0;

	if ((ifp->if_flags & IFF_NOARP) != 0) {
		result = ENOTSUP;
		goto done;
	}

	/* callee holds a proto refcnt upon success */
	ifnet_lock_shared(ifp);
	proto = find_attached_proto(ifp, target_proto->sa_family);
	ifnet_lock_done(ifp);
	if (proto == NULL) {
		result = ENOTSUP;
	} else {
		proto_media_send_arp arpp;
		arpp = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
		if (arpp == NULL) {
			result = ENOTSUP;
		} else {
			/* account for the transmit before handing off */
			switch (arpop) {
			case ARPOP_REQUEST:
				arpstat.txrequests++;
				/* non-NULL target_hw means a unicast request */
				if (target_hw != NULL) {
					arpstat.txurequests++;
				}
				break;
			case ARPOP_REPLY:
				arpstat.txreplies++;
				break;
			}
			result = arpp(ifp, arpop, sender_hw, sender_proto,
			    target_hw, target_proto);
		}
		if_proto_free(proto);
	}
done:
	return result;
}
8055
/*
 * Opaque token type for the per-thread network mark push/pop API below.
 * Tokens are encoded as small byte offsets from the address of the
 * empty net_thread_marks_base object; no storage is ever allocated.
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* token meaning "no marks were changed" (offset 0 from base) */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
8061
/*
 * Set the given mark bits on the current thread and return a token that
 * records exactly which bits this call newly set (bits already set are
 * excluded).  The token is encoded as base + pop, i.e. a pointer offset
 * carrying the bitmask; pass it to net_thread_marks_pop() to undo.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* only the bits not already set get recorded for undo */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	/* encode the undo mask as an offset from the base address */
	return (net_thread_marks_t)&base[pop];
}
8079
/*
 * Clear the given mark bits on the current thread and return a token
 * recording exactly which bits this call newly cleared (bits already
 * clear are excluded).  Mirror image of net_thread_marks_push(); undo
 * with net_thread_unmarks_pop().
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* only the bits currently set get recorded for undo */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	/* encode the undo mask as an offset from the base address */
	return (net_thread_marks_t)&base[unpop];
}
8097
/*
 * Undo a net_thread_marks_push(): decode the bitmask from the token's
 * offset against the base address and clear those bits on the current
 * thread.  Verifies the mask fits in 32 bits and that every bit being
 * popped is in fact still set.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
8113
/*
 * Undo a net_thread_unmarks_push(): decode the bitmask from the token's
 * offset and re-set those bits on the current thread.  Verifies the
 * mask fits in 32 bits and that none of the bits are currently set.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
8129
8130 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)8131 net_thread_is_marked(u_int32_t check)
8132 {
8133 if (check != 0) {
8134 struct uthread *uth = current_uthread();
8135 return uth->uu_network_marks & check;
8136 } else {
8137 return 0;
8138 }
8139 }
8140
8141 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)8142 net_thread_is_unmarked(u_int32_t check)
8143 {
8144 if (check != 0) {
8145 struct uthread *uth = current_uthread();
8146 return ~uth->uu_network_marks & check;
8147 } else {
8148 return 0;
8149 }
8150 }
8151
/*
 * An ARP announcement (gratuitous ARP) carries the same IPv4 address
 * as both sender and target.  Returns non-zero when that is the case;
 * zero when either pointer is NULL or the addresses differ.
 */
static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,
    const struct sockaddr_in * target_sin)
{
	int same = 0;

	if (sender_sin != NULL && target_sin != NULL) {
		same = (sender_sin->sin_addr.s_addr ==
		    target_sin->sin_addr.s_addr);
	}
	return same;
}
8162
/*
 * Send an ARP packet.  Normally forwards to dlil_send_arp_internal() on
 * the given interface.  Special cases:
 *  - If the target is a router (RTF_ROUTER), the target address is
 *    copied into a sockaddr_inarp with SIN_ROUTER set so the send_arp
 *    callback can see that information.
 *  - An ARP request for an IPv4 link-local target (unless it is an
 *    announcement) is broadcast on every attached interface that has
 *    IFEF_ARPLL set, using each interface's own source IP/hardware
 *    address; the first non-ENOTSUP result is returned.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces. The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t *ifp_list;
		u_int32_t count;
		u_int32_t ifp_on;

		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing. This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* hold the lladdr across the unlocked send below */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* keep the first result that isn't "unsupported" */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
8277
8278 /*
8279 * Caller must hold ifnet head lock.
8280 */
8281 static int
ifnet_lookup(struct ifnet * ifp)8282 ifnet_lookup(struct ifnet *ifp)
8283 {
8284 struct ifnet *_ifp;
8285
8286 LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
8287 TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
8288 if (_ifp == ifp) {
8289 break;
8290 }
8291 }
8292 return _ifp != NULL;
8293 }
8294
8295 /*
8296 * Caller has to pass a non-zero refio argument to get a
8297 * IO reference count. This will prevent ifnet_detach from
8298 * being called when there are outstanding io reference counts.
8299 */
8300 int
ifnet_is_attached(struct ifnet * ifp,int refio)8301 ifnet_is_attached(struct ifnet *ifp, int refio)
8302 {
8303 int ret;
8304
8305 lck_mtx_lock_spin(&ifp->if_ref_lock);
8306 if ((ret = IF_FULLY_ATTACHED(ifp))) {
8307 if (refio > 0) {
8308 ifp->if_refio++;
8309 }
8310 }
8311 lck_mtx_unlock(&ifp->if_ref_lock);
8312
8313 return ret;
8314 }
8315
/*
 * Record that one more interface worker thread is starting up;
 * paired with ifnet_decr_pending_thread_count() once it is running.
 */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8323
/*
 * Record that an interface worker thread has finished starting up;
 * wakes any waiter sleeping on if_threads_pending when the count
 * drops to zero.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8335
8336 /*
8337 * Caller must ensure the interface is attached; the assumption is that
8338 * there is at least an outstanding IO reference count held already.
8339 * Most callers would call ifnet_is_{attached,data_ready}() instead.
8340 */
8341 void
ifnet_incr_iorefcnt(struct ifnet * ifp)8342 ifnet_incr_iorefcnt(struct ifnet *ifp)
8343 {
8344 lck_mtx_lock_spin(&ifp->if_ref_lock);
8345 VERIFY(IF_FULLY_ATTACHED(ifp));
8346 VERIFY(ifp->if_refio > 0);
8347 ifp->if_refio++;
8348 lck_mtx_unlock(&ifp->if_ref_lock);
8349 }
8350
/*
 * Drop one IO reference with if_ref_lock already held.  When the last
 * reference goes away while the interface is detaching, wake up the
 * thread in ifnet_detach that is sleeping on if_refio.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* datamov refs are a subset of io refs; they cannot outlive them */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8371
/*
 * Drop one IO reference on the interface; locked wrapper around
 * ifnet_decr_iorefcnt_locked().
 */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8379
/*
 * Enter the data-movement path on the interface.  Succeeds (returns
 * TRUE and takes both an IO ref and a datamov ref) only if the
 * interface is fully attached AND ready, i.e. not suspended.  Paired
 * with ifnet_datamov_end().
 */
boolean_t
ifnet_datamov_begin(struct ifnet *ifp)
{
	boolean_t ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
		ifp->if_refio++;
		ifp->if_datamov++;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
8394
/*
 * Leave the data-movement path: drop the datamov ref taken by
 * ifnet_datamov_begin(), waking any drainers once the last mover is
 * gone, then drop the associated IO ref.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8412
/*
 * Suspend data movement with if_ref_lock held: take an IO ref (dropped
 * by ifnet_datamov_resume()) and, on the first suspension, clear
 * IFRF_READY so ifnet_datamov_begin() starts failing.  Nestable.
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8423
/*
 * Suspend data movement on the interface; locked wrapper around
 * ifnet_datamov_suspend_locked().  Balanced by ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8432
/*
 * Suspend data movement only if nobody has suspended it yet.  Returns
 * TRUE if this call performed the suspension (caller must later call
 * ifnet_datamov_resume()), FALSE if it was already suspended.
 */
boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	if (ifp->if_suspend > 0) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		return FALSE;
	}
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
	return TRUE;
}
8446
/*
 * Wait for all in-flight data movers to leave the interface and then
 * purge its transmit queues.  Data movement must already be suspended
 * (if_suspend > 0), so no new movers can enter while we sleep on
 * if_datamov.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	/* still suspended: nobody may have re-enabled the data path */
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8474
/*
 * Convenience: suspend data movement and wait for in-flight movers to
 * drain.  Balanced by ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8481
/*
 * Undo one suspension: when the last suspender resumes, restore
 * IFRF_READY so ifnet_datamov_begin() can succeed again, then drop the
 * IO ref taken at suspend time.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8495
/*
 * Debug-build refcount tracer for dlil_ifnet objects: records a stack
 * trace into the per-object circular refhold/refrele history.  Panics
 * if the object was not allocated with the debug structure (DLIF_DEBUG).
 */
static void
dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
{
	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
	ctrace_t *tr;
	u_int32_t idx;
	u_int16_t *cnt;

	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
		/* NOTREACHED */
	}

	/* pick the hold or release history ring */
	if (refhold) {
		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
		tr = dl_if_dbg->dldbg_if_refhold;
	} else {
		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
		tr = dl_if_dbg->dldbg_if_refrele;
	}

	/* atomically claim the next slot in the circular buffer */
	idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
	ctrace_record(&tr[idx]);
}
8520
/*
 * Take a reference on the dlil_ifnet backing an ifnet.  Panics on
 * refcount wraparound; invokes the debug trace hook when configured.
 * Returns EINVAL if ifp is NULL, else 0.
 */
errno_t
dlil_if_ref(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	++dl_if->dl_if_refcnt;
	if (dl_if->dl_if_refcnt == 0) {
		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
	}
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, TRUE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);

	return 0;
}
8543
/*
 * Drop a reference on the dlil_ifnet backing an ifnet.  Panics on
 * underflow.  When the last reference is being dropped on an embryonic
 * (never fully attached) interface, release the underlying storage via
 * _dlil_if_release() after unlocking.  Returns EINVAL if ifp is NULL.
 */
errno_t
dlil_if_free(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	bool need_release = FALSE;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	switch (dl_if->dl_if_refcnt) {
	case 0:
		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
		break;
	case 1:
		/* last ref on an embryonic ifnet: storage must be released */
		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
			need_release = TRUE;
		}
		break;
	default:
		break;
	}
	--dl_if->dl_if_refcnt;
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, FALSE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);
	/* release outside the lock to avoid recursing into it */
	if (need_release) {
		_dlil_if_release(ifp, true);
	}
	return 0;
}
8578
/*
 * Attach a protocol (already wrapped in an if_proto) to its interface:
 * reject duplicates, let the family module refine the demux descriptors,
 * insert the protocol at the tail of its hash chain, take the attach
 * refcnt and post a KEV_DL_PROTO_ATTACHED event.  On success the count
 * of attached protocols is optionally returned via proto_count.
 * Returns EINVAL for vmnet interfaces (non-PF_BRIDGE), ENXIO if the
 * interface is detaching, EEXIST for a duplicate attach, or the family
 * module's error.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* IO refcount held until ioref_done below */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* walk to the tail of the chain to preserve attach order */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8658
/*
 * Post-attach housekeeping after a protocol is attached to an
 * interface: mark the interface IFF_UP, push the flag change to the
 * driver and broadcast the flags event.  On SKYWALK builds, attaching
 * IP (v4/v6) also attaches the flowswitch nexus.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8682
/*
 * Public KPI: attach a v1 protocol to an interface.  Validates the
 * arguments, checks the interface is still in the global list (under
 * the head lock, held across the attach), builds an if_proto from the
 * v1 callback table and hands it to dlil_attach_protocol().  On success
 * the interface is marked up via dlil_handle_proto_attach(); on failure
 * the if_proto is freed.  Returns EINVAL, ENXIO, EEXIST or the attach
 * error.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8744
/*
 * Public KPI: attach a v2 protocol to an interface.  Identical flow to
 * ifnet_attach_protocol() except the if_proto is populated from the v2
 * callback table (kProtoKPI_v2).  Returns EINVAL, ENXIO, EEXIST or the
 * attach error.
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8806
/*
 * Detach a previously attached protocol handler from an interface.
 *
 * Removes the protocol from the interface's protocol hash table and
 * replaces its callbacks with inert media stubs that fail with ENXIO,
 * so any code still holding a reference to the if_proto cannot invoke
 * the caller's (soon to be gone) handlers.  The remaining teardown
 * runs when the last protocol reference is released via
 * if_proto_free().
 *
 * Returns 0 on success, EINVAL on bad arguments, or ENXIO if the
 * protocol family is not attached to this interface.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	/* Unlink from the hash bucket; lookups no longer find it. */
	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/*
	 * Neutralize the callbacks with ENXIO-returning stubs.  The
	 * 'detached' callback is deliberately left in place so it can
	 * still be delivered when the proto is finally freed.
	 */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	/* NOTE(review): flag appears to gate further use elsewhere — confirm */
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8872
8873 static errno_t
ifproto_media_input_v1(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet,char * header)8874 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8875 struct mbuf *packet, char *header)
8876 {
8877 #pragma unused(ifp, protocol, packet, header)
8878 return ENXIO;
8879 }
8880
8881 static errno_t
ifproto_media_input_v2(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet)8882 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8883 struct mbuf *packet)
8884 {
8885 #pragma unused(ifp, protocol, packet)
8886 return ENXIO;
8887 }
8888
8889 static errno_t
ifproto_media_preout(struct ifnet * ifp,protocol_family_t protocol,mbuf_t * packet,const struct sockaddr * dest,void * route,char * frame_type,char * link_layer_dest)8890 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8891 mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8892 char *link_layer_dest)
8893 {
8894 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8895 return ENXIO;
8896 }
8897
8898 static void
ifproto_media_event(struct ifnet * ifp,protocol_family_t protocol,const struct kev_msg * event)8899 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8900 const struct kev_msg *event)
8901 {
8902 #pragma unused(ifp, protocol, event)
8903 }
8904
8905 static errno_t
ifproto_media_ioctl(struct ifnet * ifp,protocol_family_t protocol,unsigned long command,void * argument)8906 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8907 unsigned long command, void *argument)
8908 {
8909 #pragma unused(ifp, protocol, command, argument)
8910 return ENXIO;
8911 }
8912
8913 static errno_t
ifproto_media_resolve_multi(ifnet_t ifp,const struct sockaddr * proto_addr,struct sockaddr_dl * out_ll,size_t ll_len)8914 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8915 struct sockaddr_dl *out_ll, size_t ll_len)
8916 {
8917 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8918 return ENXIO;
8919 }
8920
8921 static errno_t
ifproto_media_send_arp(struct ifnet * ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8922 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8923 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8924 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8925 {
8926 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8927 return ENXIO;
8928 }
8929
8930 extern int if_next_index(void);
8931 extern int tcp_ecn_outbound;
8932
8933 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8934 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8935 {
8936 uint32_t sflags = 0;
8937 int err;
8938
8939 if (if_flowadv) {
8940 sflags |= PKTSCHEDF_QALG_FLOWCTL;
8941 }
8942
8943 if (if_delaybased_queue) {
8944 sflags |= PKTSCHEDF_QALG_DELAYBASED;
8945 }
8946
8947 if (ifp->if_output_sched_model ==
8948 IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8949 sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8950 }
8951 /* Inherit drop limit from the default queue */
8952 if (ifp->if_snd != ifcq) {
8953 IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8954 }
8955 /* Initialize transmit queue(s) */
8956 err = ifclassq_setup(ifcq, ifp, sflags);
8957 if (err != 0) {
8958 panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8959 "err=%d", __func__, ifp, err);
8960 /* NOTREACHED */
8961 }
8962 }
8963
/*
 * Attach an interface to the system.
 *
 * Moves an embryonic ifnet onto the global interface list: assigns an
 * if_index (recycling a free slot if the index space is exhausted),
 * installs the link-layer address, resets statistics and per-interface
 * state, sets up transmit queues and the DLIL input/start/poll kernel
 * threads, attaches IGMP/MLD state and (if configured) the Skywalk
 * netif nexus and packet filter, and finally marks the interface
 * IFRF_ATTACHED | IFRF_READY and posts KEV_DL_IF_ATTACHED.
 *
 * ll_addr, if non-NULL, supplies the link-layer address; its length
 * must match if_addrlen when the latter is already set.
 *
 * Returns 0 on success; EINVAL on a NULL ifp or mismatched address
 * length; EEXIST if already attached; ENODEV if the family module
 * callbacks are missing; ENOBUFS if no if_index or link address
 * storage could be obtained.
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	/* Adopt or validate the caller-supplied link address length. */
	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* A recycled (DLIF_REUSE) ifnet may carry over multicast records. */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	if (ifp->if_index == 0) {
		int idx = if_next_index();

		/*
		 * Since we exhausted the list of
		 * if_index's, try to find an empty slot
		 * in ifindex2ifnet.
		 */
		if (idx == -1 && if_index >= UINT16_MAX) {
			for (int i = 1; i < if_index; i++) {
				if (ifindex2ifnet[i] == NULL &&
				    ifnet_addrs[i - 1] == NULL) {
					idx = i;
					break;
				}
			}
		}
		/* No free index at all: give up with ENOBUFS. */
		if (idx == -1) {
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	IFA_ADDREF_LOCKED(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Publish: visible via the head list and ifindex2ifnet from here. */
	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		if (err == ENODEV) {
			/* ENODEV means no dedicated thread; not fatal. */
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_pacemaker_time = 0;
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* Give the start thread a slight scheduling boost. */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* Give the poll thread a slight scheduling boost. */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/* Count link-layer memberships carried over by a recycled ifnet. */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	/* Drop per-ifnet and head locks; lock-free setup follows. */
	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that. This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
9466
9467 /*
 * Prepare the storage for the first/permanent link address, which
 * must have the same lifetime as the ifnet itself. Although the link
9470 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9471 * its location in memory must never change as it may still be referred
9472 * to by some parts of the system afterwards (unfortunate implementation
9473 * artifacts inherited from BSD.)
9474 *
9475 * Caller must hold ifnet lock as writer.
9476 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Size the AF_LINK sockaddr: header up to sdl_data, plus the
	 * interface name, plus the link-layer address bytes; then round
	 * up to a 4-byte boundary and never below sizeof(sockaddr_dl).
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			/* ifaddr followed by two max-size sockaddrs */
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			/* permanent allocation: never freed (see header) */
			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure. This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	/* Fill in the AF_LINK address: name, index, type, lladdr bytes. */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* Netmask: all-ones over the interface-name portion only. */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* Drop the reference on the link address we replaced, if any. */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9585
/*
 * Purge all network-layer addresses from the interface: IPv4 first
 * (when INET is configured), then IPv6.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9594
9595 errno_t
ifnet_detach(ifnet_t ifp)9596 ifnet_detach(ifnet_t ifp)
9597 {
9598 struct ifnet *delegated_ifp;
9599 struct nd_ifinfo *ndi = NULL;
9600
9601 if (ifp == NULL) {
9602 return EINVAL;
9603 }
9604
9605 ndi = ND_IFINFO(ifp);
9606 if (NULL != ndi) {
9607 ndi->cga_initialized = FALSE;
9608 }
9609
9610 /* Mark the interface down */
9611 if_down(ifp);
9612
9613 /*
9614 * IMPORTANT NOTE
9615 *
9616 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
9617 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
9618 * until after we've waited for all I/O references to drain
9619 * in ifnet_detach_final().
9620 */
9621
9622 ifnet_head_lock_exclusive();
9623 ifnet_lock_exclusive(ifp);
9624
9625 if (ifp->if_output_netem != NULL) {
9626 netem_destroy(ifp->if_output_netem);
9627 ifp->if_output_netem = NULL;
9628 }
9629
9630 /*
9631 * Check to see if this interface has previously triggered
9632 * aggressive protocol draining; if so, decrement the global
9633 * refcnt and clear PR_AGGDRAIN on the route domain if
9634 * there are no more of such an interface around.
9635 */
9636 (void) ifnet_set_idle_flags_locked(ifp, 0, ~0);
9637
9638 lck_mtx_lock_spin(&ifp->if_ref_lock);
9639 if (!(ifp->if_refflags & IFRF_ATTACHED)) {
9640 lck_mtx_unlock(&ifp->if_ref_lock);
9641 ifnet_lock_done(ifp);
9642 ifnet_head_done();
9643 return EINVAL;
9644 } else if (ifp->if_refflags & IFRF_DETACHING) {
9645 /* Interface has already been detached */
9646 lck_mtx_unlock(&ifp->if_ref_lock);
9647 ifnet_lock_done(ifp);
9648 ifnet_head_done();
9649 return ENXIO;
9650 }
9651 VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
9652 /* Indicate this interface is being detached */
9653 ifp->if_refflags &= ~IFRF_ATTACHED;
9654 ifp->if_refflags |= IFRF_DETACHING;
9655 lck_mtx_unlock(&ifp->if_ref_lock);
9656
9657 if (dlil_verbose) {
9658 DLIL_PRINTF("%s: detaching\n", if_name(ifp));
9659 }
9660
9661 /* clean up flow control entry object if there's any */
9662 if (ifp->if_eflags & IFEF_TXSTART) {
9663 ifnet_flowadv(ifp->if_flowhash);
9664 }
9665
9666 /* Reset ECN enable/disable flags */
9667 /* Reset CLAT46 flag */
9668 if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);
9669
9670 /*
9671 * We do not reset the TCP keep alive counters in case
9672 * a TCP connection stays connection after the interface
9673 * went down
9674 */
9675 if (ifp->if_tcp_kao_cnt > 0) {
9676 os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
9677 __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
9678 }
9679 ifp->if_tcp_kao_max = 0;
9680
9681 /*
9682 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
9683 * no longer be visible during lookups from this point.
9684 */
9685 VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
9686 TAILQ_REMOVE(&ifnet_head, ifp, if_link);
9687 ifp->if_link.tqe_next = NULL;
9688 ifp->if_link.tqe_prev = NULL;
9689 if (ifp->if_ordered_link.tqe_next != NULL ||
9690 ifp->if_ordered_link.tqe_prev != NULL) {
9691 ifnet_remove_from_ordered_list(ifp);
9692 }
9693 ifindex2ifnet[ifp->if_index] = NULL;
9694
9695 /* 18717626 - reset router mode */
9696 if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
9697 ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;
9698
9699 /* Record detach PC stacktrace */
9700 ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);
9701
9702 /* Clear logging parameters */
9703 bzero(&ifp->if_log, sizeof(ifp->if_log));
9704
9705 /* Clear delegated interface info (reference released below) */
9706 delegated_ifp = ifp->if_delegated.ifp;
9707 bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));
9708
9709 /* Reset interface state */
9710 bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
9711
9712 /*
9713 * Increment the generation count on interface deletion
9714 */
9715 ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);
9716
9717 ifnet_lock_done(ifp);
9718 ifnet_head_done();
9719
9720 /* Release reference held on the delegated interface */
9721 if (delegated_ifp != NULL) {
9722 ifnet_release(delegated_ifp);
9723 }
9724
9725 /* Reset Link Quality Metric (unless loopback [lo0]) */
9726 if (ifp != lo_ifp) {
9727 if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
9728 }
9729
9730 /* Reset TCP local statistics */
9731 if (ifp->if_tcp_stat != NULL) {
9732 bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
9733 }
9734
9735 /* Reset UDP local statistics */
9736 if (ifp->if_udp_stat != NULL) {
9737 bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
9738 }
9739
9740 /* Reset ifnet IPv4 stats */
9741 if (ifp->if_ipv4_stat != NULL) {
9742 bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
9743 }
9744
9745 /* Reset ifnet IPv6 stats */
9746 if (ifp->if_ipv6_stat != NULL) {
9747 bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
9748 }
9749
9750 /* Release memory held for interface link status report */
9751 if (ifp->if_link_status != NULL) {
9752 kfree_type(struct if_link_status, ifp->if_link_status);
9753 ifp->if_link_status = NULL;
9754 }
9755
9756 /* Disable forwarding cached route */
9757 lck_mtx_lock(&ifp->if_cached_route_lock);
9758 ifp->if_fwd_cacheok = 0;
9759 lck_mtx_unlock(&ifp->if_cached_route_lock);
9760
9761 /* Disable data threshold and wait for any pending event posting */
9762 ifp->if_data_threshold = 0;
9763 VERIFY(ifp->if_dt_tcall != NULL);
9764 (void) thread_call_cancel_wait(ifp->if_dt_tcall);
9765
9766 /*
9767 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
9768 * references to the info structures and leave them attached to
9769 * this ifnet.
9770 */
9771 #if INET
9772 igmp_domifdetach(ifp);
9773 #endif /* INET */
9774 mld_domifdetach(ifp);
9775
9776 #if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
9778 netns_ifnet_detach(ifp);
9779 #endif /* SKYWALK */
9780 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);
9781
9782 /* Let worker thread take care of the rest, to avoid reentrancy */
9783 dlil_if_lock();
9784 ifnet_detaching_enqueue(ifp);
9785 dlil_if_unlock();
9786
9787 return 0;
9788 }
9789
9790 static void
ifnet_detaching_enqueue(struct ifnet * ifp)9791 ifnet_detaching_enqueue(struct ifnet *ifp)
9792 {
9793 dlil_if_lock_assert();
9794
9795 ++ifnet_detaching_cnt;
9796 VERIFY(ifnet_detaching_cnt != 0);
9797 TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
9798 wakeup((caddr_t)&ifnet_delayed_run);
9799 }
9800
9801 static struct ifnet *
ifnet_detaching_dequeue(void)9802 ifnet_detaching_dequeue(void)
9803 {
9804 struct ifnet *ifp;
9805
9806 dlil_if_lock_assert();
9807
9808 ifp = TAILQ_FIRST(&ifnet_detaching_head);
9809 VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9810 if (ifp != NULL) {
9811 VERIFY(ifnet_detaching_cnt != 0);
9812 --ifnet_detaching_cnt;
9813 TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9814 ifp->if_detaching_link.tqe_next = NULL;
9815 ifp->if_detaching_link.tqe_prev = NULL;
9816 }
9817 return ifp;
9818 }
9819
/*
 * Continuation body of the interface detacher thread: drains the
 * ifnet_detaching queue, calling ifnet_detach_final() on each entry,
 * then blocks as a continuation waiting for more work.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		/* first entry after thread creation; see ifnet_detacher_thread_func() */
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop dlil_if_lock across the (blocking) final detach */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* queue drained; sleep until the next enqueue wakes us */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9862
/*
 * Entry point of the interface detacher thread.  Arms the initial wait
 * and immediately self-wakes so that ifnet_detacher_thread_cont() runs
 * once to leave the embryonic state; all real work happens in the
 * continuation.  Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9879
/*
 * Final stage of interface detach, run from the detacher thread after
 * ifnet_detach() has marked the interface IFRF_DETACHING and removed it
 * from the global lists.  Tears down BPF taps, nexuses, filters,
 * protocols, the link address, and the per-interface worker threads;
 * finally clears IFRF_DETACHING, invokes the driver's if_free callback,
 * and drops the reference held since attach.
 */
static void
ifnet_detach_final(struct ifnet *ifp)
{
	struct ifnet_filter *filter, *filter_next;
	struct dlil_ifnet *dlifp;
	struct ifnet_filter_head fhead;
	struct dlil_threading_info *inp;
	struct ifaddr *ifa;
	ifnet_detached_func if_free;
	int i;

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

#if SKYWALK
	dlil_netif_detach_notify(ifp);
	/*
	 * Wait for the datapath to quiesce before tearing down
	 * netif/flowswitch nexuses.
	 */
	dlil_quiesce_and_detach_nexuses(ifp);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}

	/*
	 * Wait until the existing IO references get released
	 * before we proceed with ifnet_detach. This is not a
	 * common case, so block without using a continuation.
	 */
	while (ifp->if_refio > 0) {
		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
		    "to be released\n", __func__, if_name(ifp));
		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
		    (PZERO - 1), "ifnet_ioref_wait", NULL);
	}

	VERIFY(ifp->if_datamov == 0);
	VERIFY(ifp->if_drainers == 0);
	VERIFY(ifp->if_suspend == 0);
	ifp->if_refflags &= ~IFRF_READY;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* Clear agent IDs */
	if (ifp->if_agentids != NULL) {
		kfree_data(ifp->if_agentids,
		    sizeof(uuid_t) * ifp->if_agentcount);
		ifp->if_agentids = NULL;
	}
	ifp->if_agentcount = 0;

#if SKYWALK
	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
#endif /* SKYWALK */
	/* Drain and destroy send queue */
	ifclassq_teardown(ifp->if_snd);

	/* Detach interface filters */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	fhead = ifp->if_flt_head;
	TAILQ_INIT(&ifp->if_flt_head);

	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
		filter_next = TAILQ_NEXT(filter, filt_next);
		/* drop if_flt_lock; detaching a filter may block */
		lck_mtx_unlock(&ifp->if_flt_lock);

		dlil_detach_filter_internal(filter, 1);
		lck_mtx_lock(&ifp->if_flt_lock);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Tell upper layers to drop their network addresses */
	if_purgeaddrs(ifp);

	ifnet_lock_exclusive(ifp);

	/* Unplumb all protocols */
	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
		struct if_proto *proto;

		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		while (proto != NULL) {
			protocol_family_t family = proto->protocol_family;
			/* proto_unplumb() must be called without the ifnet lock */
			ifnet_lock_done(ifp);
			proto_unplumb(family, ifp);
			ifnet_lock_exclusive(ifp);
			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		}
		/* There should not be any protocols left */
		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
	}
	kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
	ifp->if_proto_hash = NULL;

	/* Detach (permanent) link address from if_addrhead */
	ifa = TAILQ_FIRST(&ifp->if_addrhead);
	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
	IFA_LOCK(ifa);
	if_detach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Remove (permanent) link address from ifnet_addrs[] */
	IFA_REMREF(ifa);
	ifnet_addrs[ifp->if_index - 1] = NULL;

	/* This interface should not be on {ifnet_head,detaching} */
	VERIFY(ifp->if_link.tqe_next == NULL);
	VERIFY(ifp->if_link.tqe_prev == NULL);
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);

	/* The slot should have been emptied */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* There should not be any addresses left */
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));

	/*
	 * Signal the starter thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_start_thread != THREAD_NULL) {
		lck_mtx_lock_spin(&ifp->if_start_lock);
		ifp->if_start_flags |= IFSF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* wait for starter thread to terminate */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_start_thread,
			    &ifp->if_start_lock, (PZERO - 1),
			    "ifnet_start_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_start_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s starter thread termination complete",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * Signal the poller thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_poll_thread != THREAD_NULL) {
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		/* wait for poller thread to terminate */
		lck_mtx_lock(&ifp->if_poll_lock);
		while (ifp->if_poll_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_poll_thread,
			    &ifp->if_poll_lock, (PZERO - 1),
			    "ifnet_poll_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_poll_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s poller thread termination complete\n",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * If thread affinity was set for the workloop thread, we will need
	 * to tear down the affinity and release the extra reference count
	 * taken at attach time. Does not apply to lo0 or other interfaces
	 * without dedicated input threads.
	 */
	if ((inp = ifp->if_inp) != NULL) {
		VERIFY(inp != dlil_main_input_thread);

		if (inp->dlth_affinity) {
			struct thread *tp, *wtp, *ptp;

			/* snapshot and clear the thread pointers under dlth_lock */
			lck_mtx_lock_spin(&inp->dlth_lock);
			wtp = inp->dlth_driver_thread;
			inp->dlth_driver_thread = THREAD_NULL;
			ptp = inp->dlth_poller_thread;
			inp->dlth_poller_thread = THREAD_NULL;
			ASSERT(inp->dlth_thread != THREAD_NULL);
			tp = inp->dlth_thread;  /* don't nullify now */
			inp->dlth_affinity_tag = 0;
			inp->dlth_affinity = FALSE;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Tear down poll thread affinity */
			if (ptp != NULL) {
				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
				VERIFY(ifp->if_xflags & IFXF_LEGACY);
				(void) dlil_affinity_set(ptp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(ptp);
			}

			/* Tear down workloop thread affinity */
			if (wtp != NULL) {
				(void) dlil_affinity_set(wtp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(wtp);
			}

			/* Tear down DLIL input thread affinity */
			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
			thread_deallocate(tp);
		}

		/* disassociate ifp DLIL input thread */
		ifp->if_inp = NULL;

		/* if the worker thread was created, tell it to terminate */
		if (inp->dlth_thread != THREAD_NULL) {
			lck_mtx_lock_spin(&inp->dlth_lock);
			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
				wakeup_one((caddr_t)&inp->dlth_flags);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			/* drop the ifnet lock while waiting for the input thread */
			ifnet_lock_done(ifp);

			/* wait for the input thread to terminate */
			lck_mtx_lock_spin(&inp->dlth_lock);
			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
			    == 0) {
				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			ifnet_lock_exclusive(ifp);
		}

		/* clean-up input thread state */
		dlil_clean_threading_info(inp);
		/* clean-up poll parameters */
		VERIFY(ifp->if_poll_thread == THREAD_NULL);
		dlil_reset_rxpoll_params(ifp);
	}

	/* The driver might unload, so point these to ourselves */
	if_free = ifp->if_free;
	ifp->if_output_dlil = ifp_if_output;
	ifp->if_output = ifp_if_output;
	ifp->if_pre_enqueue = ifp_if_output;
	ifp->if_start = ifp_if_start;
	ifp->if_output_ctl = ifp_if_ctl;
	ifp->if_input_dlil = ifp_if_input;
	ifp->if_input_poll = ifp_if_input_poll;
	ifp->if_input_ctl = ifp_if_ctl;
	ifp->if_ioctl = ifp_if_ioctl;
	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
	ifp->if_free = ifp_if_free;
	ifp->if_demux = ifp_if_demux;
	ifp->if_event = ifp_if_event;
	ifp->if_framer_legacy = ifp_if_framer;
	ifp->if_framer = ifp_if_framer_extended;
	ifp->if_add_proto = ifp_if_add_proto;
	ifp->if_del_proto = ifp_if_del_proto;
	ifp->if_check_multi = ifp_if_check_multi;

	/* wipe out interface description */
	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	ifp->if_desc.ifd_len = 0;
	VERIFY(ifp->if_desc.ifd_desc != NULL);
	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);

	/* there shouldn't be any delegation by now */
	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	/* QoS marking get cleared */
	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);

#if SKYWALK
	/* the nexus destructor is responsible for clearing these */
	VERIFY(ifp->if_na_ops == NULL);
	VERIFY(ifp->if_na == NULL);
#endif /* SKYWALK */

	/* promiscuous/allmulti counts need to start at zero again */
	ifp->if_pcount = 0;
	ifp->if_amcount = 0;
	ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);

	ifnet_lock_done(ifp);

#if PF
	/*
	 * Detach this interface from packet filter, if enabled.
	 */
	pf_ifnet_hook(ifp, 0);
#endif /* PF */

	/* Filter list should be empty */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Last chance to drain send queue */
	if_qflush_snd(ifp, 0);

	/* Last chance to cleanup any cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	VERIFY(!ifp->if_fwd_cacheok);
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);
	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));

	ifnet_llreach_ifdetach(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);

	/*
	 * Finally, mark this ifnet as detached.
	 */
	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached\n", if_name(ifp));
	}
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	ifp->if_refflags &= ~IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);
	/* give the driver its last-chance callback, if registered */
	if (if_free != NULL) {
		if_free(ifp);
	}

	ifclassq_release(&ifp->if_snd);

	/* we're fully detached, clear the "in use" bit */
	dlifp = (struct dlil_ifnet *)ifp;
	lck_mtx_lock(&dlifp->dl_if_lock);
	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
	dlifp->dl_if_flags &= ~DLIF_INUSE;
	lck_mtx_unlock(&dlifp->dl_if_lock);

	/* Release reference held during ifnet attach */
	ifnet_release(ifp);
}
10264
/*
 * Output stub installed on a detached ifnet: silently drops the
 * packet chain and reports success.
 */
errno_t
ifp_if_output(struct ifnet *ifp, struct mbuf *m)
{
#pragma unused(ifp)
	m_freem_list(m);
	return 0;
}
10272
/*
 * Start stub installed on a detached ifnet: purge anything still
 * sitting in the interface queues.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
10278
/*
 * Input stub installed on a detached ifnet: free the chain and
 * return ENXIO since the interface is gone.
 */
static errno_t
ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
#pragma unused(ifp, m_tail, s, poll, tp)
	m_freem_list(m_head);
	return ENXIO;
}
10288
10289 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)10290 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
10291 struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
10292 {
10293 #pragma unused(ifp, flags, max_cnt)
10294 if (m_head != NULL) {
10295 *m_head = NULL;
10296 }
10297 if (m_tail != NULL) {
10298 *m_tail = NULL;
10299 }
10300 if (cnt != NULL) {
10301 *cnt = 0;
10302 }
10303 if (len != NULL) {
10304 *len = 0;
10305 }
10306 }
10307
/*
 * Control stub installed on a detached ifnet: no commands supported.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
10314
/*
 * Demux stub installed on a detached ifnet: drop the packet and tell
 * the caller it has been consumed (EJUSTRETURN).
 */
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	m_freem(m);
	return EJUSTRETURN;
}
10322
/*
 * add_proto stub installed on a detached ifnet: protocols can no
 * longer be attached.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
10330
/*
 * del_proto stub installed on a detached ifnet: nothing to detach.
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
10337
/*
 * check_multi stub installed on a detached ifnet: multicast
 * membership checks are not supported.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
10344
/*
 * Legacy framer stub installed on a detached ifnet; forwards to the
 * extended variant.  The signature differs per platform: the embedded
 * (non-macOS) legacy framer carries pre/post length parameters,
 * while the macOS one does not, so NULL is passed there instead.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10363
10364 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10365 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10366 const struct sockaddr *sa, const char *ll, const char *t,
10367 u_int32_t *pre, u_int32_t *post)
10368 {
10369 #pragma unused(ifp, sa, ll, t)
10370 m_freem(*m);
10371 *m = NULL;
10372
10373 if (pre != NULL) {
10374 *pre = 0;
10375 }
10376 if (post != NULL) {
10377 *post = 0;
10378 }
10379
10380 return EJUSTRETURN;
10381 }
10382
/*
 * ioctl stub installed on a detached ifnet: no commands supported.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
10389
/*
 * BPF tap stub installed on a detached ifnet: silently accept and
 * ignore the request.
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
10397
/*
 * if_free stub installed on a detached ifnet: no-op, in case the
 * original driver has been unloaded.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10403
/*
 * Event stub installed on a detached ifnet: events are ignored.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10409
/*
 * Find a recyclable dlil_ifnet matching (family, uniqueid) or allocate
 * a fresh one.  Returns 0 with *ifp set (and referenced via
 * dlil_if_ref()) on success, EBUSY if an interface with the same
 * extended name or unique id is already in use, or ENOMEM if the
 * unique-id copy cannot be allocated.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in the in-use state; to guarantee that, the whole list has
	 * to be traversed.
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* point name/xname at the embedded storage within dlil_ifnet */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	/* initialize all the per-interface locks */
	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;
	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* sanity: the object we hand out must be 64-bit aligned */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10588
/*
 * Return a dlil_ifnet to the pool: drop the alloc statistics, free
 * any out-of-line broadcast address storage, restore name/xname to
 * the embedded storage (so the caller-supplied strings can go away),
 * and optionally clear DLIF_INUSE so the entry can be recycled.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* broadcast address larger than the inline buffer was kalloc'ed */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10619
/*
 * Public wrapper for _dlil_if_release() that leaves DLIF_INUSE set;
 * the in-use bit is cleared elsewhere (see ifnet_detach_final()).
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10625
/* Acquire the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10631
/* Release the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10637
/* Assert that the current thread owns the dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10643
/*
 * Unplumb the well-known protocols from an interface during detach.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10659
/*
 * Copy the interface's cached IPv4 source route into *dst under the
 * cached-route lock.  route_copyout transfers a route reference, so the
 * caller owns whatever ends up in *dst.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	/*
	 * Take the lock in spin mode first, then convert to a full mutex
	 * since route_copyout may block.
	 */
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10670
/*
 * Install *src as the interface's cached IPv4 source route, consuming the
 * caller's reference.  If forwarding-cache use is currently disallowed
 * (if_fwd_cacheok clear), the route is simply released instead.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10684
/*
 * IPv6 counterpart of ifp_src_route_copyout: copy the cached IPv6 source
 * route (with its reference) into *dst under the cached-route lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	/* route_copyout operates on the generic struct route prefix */
	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10696
/*
 * IPv6 counterpart of ifp_src_route_copyin: install *src as the cached
 * IPv6 source route (consuming the reference), or release it when the
 * forwarding cache is disabled.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10711
/*
 * Look up (and cache) the route for src_ip scoped to this interface.
 * Returns a referenced rtentry (caller must release), or NULL if no
 * route could be found.  The per-ifnet route cache is consulted first
 * and refreshed when stale or when the destination changed.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route src_rt;
	struct sockaddr_in *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	/* Pull the cached route (and its reference) out of the ifnet */
	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		/* Cache miss or stale entry; drop it and do a fresh lookup */
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			/* First use: initialize the sockaddr template */
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	/* The retained (or cached) reference is handed to the caller */
	return src_rt.ro_rt;
}
10746
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet: look up (and cache)
 * the route for *src_ip6 scoped to this interface.  Returns a referenced
 * rtentry or NULL.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	/* Pull the cached route (and its reference) out of the ifnet */
	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		/* Cache miss or stale entry; drop it and do a fresh lookup */
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			/* First use: initialize the sockaddr template */
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/* ro_rt is NULL here after ROUTE_RELEASE above */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	/* The retained (or cached) reference is handed to the caller */
	return src_rt.ro_rt;
}
10783
/*
 * Update the interface's link quality metric (LQM) state and, if it
 * changed, post a KEV_DL_LINK_QUALITY_METRIC_CHANGED kernel event.
 *
 * 'locked' indicates whether the caller already holds the ifnet lock
 * exclusively; either way the lock is dropped around the event post and
 * the caller's lock state is restored on return.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/*
		 * Abort-level quality: flag the TCP pcbinfo and kick the
		 * fast timer so affected connections get handled promptly.
		 */
		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return; /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10847
/*
 * Update the interface's RRC (radio resource control) state and post a
 * KEV_DL_RRC_STATE_CHANGED event if it changed.
 *
 * Called with the ifnet lock held exclusively; note that the lock is
 * dropped around the event post and reacquired before returning, so the
 * caller must tolerate the lock being released temporarily.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		/* No change; nothing to record or announce */
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	/* Restore the lock state expected by the caller */
	ifnet_lock_exclusive(ifp);
}
10877
10878 errno_t
if_state_update(struct ifnet * ifp,struct if_interface_state * if_interface_state)10879 if_state_update(struct ifnet *ifp,
10880 struct if_interface_state *if_interface_state)
10881 {
10882 u_short if_index_available = 0;
10883
10884 ifnet_lock_exclusive(ifp);
10885
10886 if ((ifp->if_type != IFT_CELLULAR) &&
10887 (if_interface_state->valid_bitmask &
10888 IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10889 ifnet_lock_done(ifp);
10890 return ENOTSUP;
10891 }
10892 if ((if_interface_state->valid_bitmask &
10893 IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10894 (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10895 if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10896 ifnet_lock_done(ifp);
10897 return EINVAL;
10898 }
10899 if ((if_interface_state->valid_bitmask &
10900 IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10901 if_interface_state->rrc_state !=
10902 IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10903 if_interface_state->rrc_state !=
10904 IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10905 ifnet_lock_done(ifp);
10906 return EINVAL;
10907 }
10908
10909 if (if_interface_state->valid_bitmask &
10910 IF_INTERFACE_STATE_LQM_STATE_VALID) {
10911 if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10912 }
10913 if (if_interface_state->valid_bitmask &
10914 IF_INTERFACE_STATE_RRC_STATE_VALID) {
10915 if_rrc_state_update(ifp, if_interface_state->rrc_state);
10916 }
10917 if (if_interface_state->valid_bitmask &
10918 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10919 ifp->if_interface_state.valid_bitmask |=
10920 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10921 ifp->if_interface_state.interface_availability =
10922 if_interface_state->interface_availability;
10923
10924 if (ifp->if_interface_state.interface_availability ==
10925 IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10926 os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10927 __func__, if_name(ifp), ifp->if_index);
10928 if_index_available = ifp->if_index;
10929 } else {
10930 os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
10931 __func__, if_name(ifp), ifp->if_index);
10932 }
10933 }
10934 ifnet_lock_done(ifp);
10935
10936 /*
10937 * Check if the TCP connections going on this interface should be
10938 * forced to send probe packets instead of waiting for TCP timers
10939 * to fire. This is done on an explicit notification such as
10940 * SIOCSIFINTERFACESTATE which marks the interface as available.
10941 */
10942 if (if_index_available > 0) {
10943 tcp_interface_send_probe(if_index_available);
10944 }
10945
10946 return 0;
10947 }
10948
10949 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10950 if_get_state(struct ifnet *ifp,
10951 struct if_interface_state *if_interface_state)
10952 {
10953 ifnet_lock_shared(ifp);
10954
10955 if_interface_state->valid_bitmask = 0;
10956
10957 if (ifp->if_interface_state.valid_bitmask &
10958 IF_INTERFACE_STATE_RRC_STATE_VALID) {
10959 if_interface_state->valid_bitmask |=
10960 IF_INTERFACE_STATE_RRC_STATE_VALID;
10961 if_interface_state->rrc_state =
10962 ifp->if_interface_state.rrc_state;
10963 }
10964 if (ifp->if_interface_state.valid_bitmask &
10965 IF_INTERFACE_STATE_LQM_STATE_VALID) {
10966 if_interface_state->valid_bitmask |=
10967 IF_INTERFACE_STATE_LQM_STATE_VALID;
10968 if_interface_state->lqm_state =
10969 ifp->if_interface_state.lqm_state;
10970 }
10971 if (ifp->if_interface_state.valid_bitmask &
10972 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10973 if_interface_state->valid_bitmask |=
10974 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10975 if_interface_state->interface_availability =
10976 ifp->if_interface_state.interface_availability;
10977 }
10978
10979 ifnet_lock_done(ifp);
10980 }
10981
10982 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10983 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10984 {
10985 if (conn_probe > 1) {
10986 return EINVAL;
10987 }
10988 if (conn_probe == 0) {
10989 if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10990 } else {
10991 if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10992 }
10993
10994 #if NECP
10995 necp_update_all_clients();
10996 #endif /* NECP */
10997
10998 tcp_probe_connectivity(ifp, conn_probe);
10999 return 0;
11000 }
11001
11002 /* for uuid.c */
/*
 * Find the interface index for "en0".  Returns en0's index, or 0 if en0
 * does not exist; in that case *ret_other_index is set to the best
 * fallback: the lowest-unit "en" interface, else any IFT_ETHER
 * interface, else 0.  Caller holds the ifnet head lock (shared).
 */
static int
get_ether_index(int * ret_other_index)
{
	struct ifnet *ifp;
	int en0_index = 0;
	int other_en_index = 0;
	int any_ether_index = 0;
	short best_unit = 0;

	*ret_other_index = 0;
	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
		/*
		 * find en0, or if not en0, the lowest unit en*, and if not
		 * that, any ethernet
		 */
		ifnet_lock_shared(ifp);
		if (strcmp(ifp->if_name, "en") == 0) {
			if (ifp->if_unit == 0) {
				/* found en0, we're done */
				en0_index = ifp->if_index;
				ifnet_lock_done(ifp);
				break;
			}
			if (other_en_index == 0 || ifp->if_unit < best_unit) {
				/* best "en" candidate so far */
				other_en_index = ifp->if_index;
				best_unit = ifp->if_unit;
			}
		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
			/* remember the first non-"en" ethernet as last resort */
			any_ether_index = ifp->if_index;
		}
		ifnet_lock_done(ifp);
	}
	if (en0_index == 0) {
		if (other_en_index != 0) {
			*ret_other_index = other_en_index;
		} else if (any_ether_index != 0) {
			*ret_other_index = any_ether_index;
		}
	}
	return en0_index;
}
11044
/*
 * Fill 'node' with an ethernet MAC address (ETHER_ADDR_LEN bytes) for
 * UUID generation, preferring en0, then another "en" interface, then any
 * ethernet.  The permanent hardware address is used when available since
 * it never changes.  Returns 0 on success, -1 if no suitable interface
 * exists.  The en0 index is cached across calls (static en0_index).
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* Re-resolve if never looked up or if the cached index went stale */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			/* Fall back to the current link-layer address */
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
11086
11087 static int
11088 sysctl_rxpoll SYSCTL_HANDLER_ARGS
11089 {
11090 #pragma unused(arg1, arg2)
11091 uint32_t i;
11092 int err;
11093
11094 i = if_rxpoll;
11095
11096 err = sysctl_handle_int(oidp, &i, 0, req);
11097 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11098 return err;
11099 }
11100
11101 if (net_rxpoll == 0) {
11102 return ENXIO;
11103 }
11104
11105 if_rxpoll = i;
11106 return err;
11107 }
11108
11109 static int
11110 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
11111 {
11112 #pragma unused(arg1, arg2)
11113 uint64_t q;
11114 int err;
11115
11116 q = if_rxpoll_mode_holdtime;
11117
11118 err = sysctl_handle_quad(oidp, &q, 0, req);
11119 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11120 return err;
11121 }
11122
11123 if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
11124 q = IF_RXPOLL_MODE_HOLDTIME_MIN;
11125 }
11126
11127 if_rxpoll_mode_holdtime = q;
11128
11129 return err;
11130 }
11131
11132 static int
11133 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
11134 {
11135 #pragma unused(arg1, arg2)
11136 uint64_t q;
11137 int err;
11138
11139 q = if_rxpoll_sample_holdtime;
11140
11141 err = sysctl_handle_quad(oidp, &q, 0, req);
11142 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11143 return err;
11144 }
11145
11146 if (q < IF_RXPOLL_SAMPLETIME_MIN) {
11147 q = IF_RXPOLL_SAMPLETIME_MIN;
11148 }
11149
11150 if_rxpoll_sample_holdtime = q;
11151
11152 return err;
11153 }
11154
11155 static int
11156 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
11157 {
11158 #pragma unused(arg1, arg2)
11159 uint64_t q;
11160 int err;
11161
11162 q = if_rxpoll_interval_time;
11163
11164 err = sysctl_handle_quad(oidp, &q, 0, req);
11165 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11166 return err;
11167 }
11168
11169 if (q < IF_RXPOLL_INTERVALTIME_MIN) {
11170 q = IF_RXPOLL_INTERVALTIME_MIN;
11171 }
11172
11173 if_rxpoll_interval_time = q;
11174
11175 return err;
11176 }
11177
11178 static int
11179 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
11180 {
11181 #pragma unused(arg1, arg2)
11182 uint32_t i;
11183 int err;
11184
11185 i = if_sysctl_rxpoll_wlowat;
11186
11187 err = sysctl_handle_int(oidp, &i, 0, req);
11188 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11189 return err;
11190 }
11191
11192 if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
11193 return EINVAL;
11194 }
11195
11196 if_sysctl_rxpoll_wlowat = i;
11197 return err;
11198 }
11199
11200 static int
11201 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
11202 {
11203 #pragma unused(arg1, arg2)
11204 uint32_t i;
11205 int err;
11206
11207 i = if_sysctl_rxpoll_whiwat;
11208
11209 err = sysctl_handle_int(oidp, &i, 0, req);
11210 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11211 return err;
11212 }
11213
11214 if (i <= if_sysctl_rxpoll_wlowat) {
11215 return EINVAL;
11216 }
11217
11218 if_sysctl_rxpoll_whiwat = i;
11219 return err;
11220 }
11221
11222 static int
11223 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
11224 {
11225 #pragma unused(arg1, arg2)
11226 int i, err;
11227
11228 i = if_sndq_maxlen;
11229
11230 err = sysctl_handle_int(oidp, &i, 0, req);
11231 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11232 return err;
11233 }
11234
11235 if (i < IF_SNDQ_MINLEN) {
11236 i = IF_SNDQ_MINLEN;
11237 }
11238
11239 if_sndq_maxlen = i;
11240 return err;
11241 }
11242
11243 static int
11244 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
11245 {
11246 #pragma unused(arg1, arg2)
11247 int i, err;
11248
11249 i = if_rcvq_maxlen;
11250
11251 err = sysctl_handle_int(oidp, &i, 0, req);
11252 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11253 return err;
11254 }
11255
11256 if (i < IF_RCVQ_MINLEN) {
11257 i = IF_RCVQ_MINLEN;
11258 }
11259
11260 if_rcvq_maxlen = i;
11261 return err;
11262 }
11263
11264 static int
11265 sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS
11266 {
11267 #pragma unused(arg1, arg2)
11268 int i, err;
11269
11270 i = if_rcvq_burst_limit;
11271
11272 err = sysctl_handle_int(oidp, &i, 0, req);
11273 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11274 return err;
11275 }
11276
11277 /*
11278 * Safeguard the burst limit to "sane" values on customer builds.
11279 */
11280 #if !(DEVELOPMENT || DEBUG)
11281 if (i < IF_RCVQ_BURST_LIMIT_MIN) {
11282 i = IF_RCVQ_BURST_LIMIT_MIN;
11283 }
11284
11285 if (IF_RCVQ_BURST_LIMIT_MAX < i) {
11286 i = IF_RCVQ_BURST_LIMIT_MAX;
11287 }
11288 #endif
11289
11290 if_rcvq_burst_limit = i;
11291 return err;
11292 }
11293
11294 static int
11295 sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS
11296 {
11297 #pragma unused(arg1, arg2)
11298 int i, err;
11299
11300 i = if_rcvq_burst_limit;
11301
11302 err = sysctl_handle_int(oidp, &i, 0, req);
11303 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11304 return err;
11305 }
11306
11307 if (IF_RCVQ_TRIM_PCT_MAX < i) {
11308 i = IF_RCVQ_TRIM_PCT_MAX;
11309 }
11310
11311 if (i < IF_RCVQ_TRIM_PCT_MIN) {
11312 i = IF_RCVQ_TRIM_PCT_MIN;
11313 }
11314
11315 if_rcvq_trim_pct = i;
11316 return err;
11317 }
11318
11319 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])11320 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
11321 int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
11322 {
11323 struct kev_dl_node_presence kev;
11324 struct sockaddr_dl *sdl;
11325 struct sockaddr_in6 *sin6;
11326 int ret = 0;
11327
11328 VERIFY(ifp);
11329 VERIFY(sa);
11330 VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
11331
11332 bzero(&kev, sizeof(kev));
11333 sin6 = &kev.sin6_node_address;
11334 sdl = &kev.sdl_node_address;
11335 nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
11336 kev.rssi = rssi;
11337 kev.link_quality_metric = lqm;
11338 kev.node_proximity_metric = npm;
11339 bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
11340
11341 ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
11342 if (ret == 0 || ret == EEXIST) {
11343 int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
11344 &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
11345 if (err != 0) {
11346 log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
11347 "error %d\n", __func__, err);
11348 }
11349 }
11350
11351 if (ret == EEXIST) {
11352 ret = 0;
11353 }
11354 return ret;
11355 }
11356
/*
 * Record that a neighbor node has gone away.  Accepts either an AF_INET6
 * address (link-layer address is recovered from the neighbor cache) or
 * an AF_LINK address (the IPv6 address is derived from it).  On success
 * a KEV_DL_NODE_ABSENCE event is posted.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		/* kev_sdl is already filled in, so no need to fetch it here */
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* Stamp the link-layer event address with our type/index */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
11397
/*
 * v2 of dlil_node_present: the caller supplies both the IPv6 address
 * (sa, AF_INET6) and the link-layer address (sdl, AF_LINK) explicitly
 * instead of a single address to decompose.  EEXIST from
 * nd6_alt_node_present is treated as success.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	/* Copy the link-layer address and stamp it with our type/index */
	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		/* EEXIST means the node was already known; post as an update */
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11441
11442 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11443 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11444 kauth_cred_t *credp)
11445 {
11446 const u_int8_t *bytes;
11447 size_t size;
11448
11449 bytes = CONST_LLADDR(sdl);
11450 size = sdl->sdl_alen;
11451
11452 #if CONFIG_MACF
11453 if (dlil_lladdr_ckreq) {
11454 switch (sdl->sdl_type) {
11455 case IFT_ETHER:
11456 case IFT_IEEE1394:
11457 break;
11458 default:
11459 credp = NULL;
11460 break;
11461 }
11462 ;
11463
11464 if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11465 static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11466 [0] = 2
11467 };
11468
11469 bytes = unspec;
11470 }
11471 }
11472 #else
11473 #pragma unused(credp)
11474 #endif
11475
11476 if (sizep != NULL) {
11477 *sizep = size;
11478 }
11479 return bytes;
11480 }
11481
11482 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])11483 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11484 u_int8_t info[DLIL_MODARGLEN])
11485 {
11486 struct kev_dl_issues kev;
11487 struct timeval tv;
11488
11489 VERIFY(ifp != NULL);
11490 VERIFY(modid != NULL);
11491 _CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11492 _CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11493
11494 bzero(&kev, sizeof(kev));
11495
11496 microtime(&tv);
11497 kev.timestamp = tv.tv_sec;
11498 bcopy(modid, &kev.modid, DLIL_MODIDLEN);
11499 if (info != NULL) {
11500 bcopy(info, &kev.info, DLIL_MODARGLEN);
11501 }
11502
11503 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
11504 &kev.link_data, sizeof(kev), FALSE);
11505 }
11506
/*
 * Handle SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC ioctls: set or get
 * the interface throttling (opportunistic) level, and report the count
 * of opportunistic connections in ifr->ifr_opportunistic.ifo_inuse.
 * Setting requires superuser.  EALREADY from the set path is mapped to
 * success (level was already in effect).
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* Map the ioctl flag to a throttle level */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		/* Get path: translate the level back to the ioctl flag */
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* "Already at that level" is not an error for the caller */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11565
/*
 * Read the current throttle level of the interface's send queue into
 * *level.  Returns ENXIO for interfaces without a transmit-start model
 * (no IFEF_TXSTART); otherwise 0, with *level defaulting to
 * IFNET_THROTTLE_OFF when the classq is not enabled.
 */
int
ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	*level = IFNET_THROTTLE_OFF;

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* Throttling works only for IFCQ, not ALTQ instances */
	if (IFCQ_IS_ENABLED(ifq)) {
		/* { 0, ... } = query request (don't modify the level) */
		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
		*level = req.level;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
11591
/*
 * Set the interface send queue's throttle level (IFNET_THROTTLE_OFF or
 * IFNET_THROTTLE_OPPORTUNISTIC).  Returns ENXIO for non-TXSTART
 * interfaces and EINVAL for unknown levels.  Turning throttling off
 * restarts the transmitter so queued packets drain.
 */
int
ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;

	switch (level) {
	case IFNET_THROTTLE_OFF:
	case IFNET_THROTTLE_OPPORTUNISTIC:
		break;
	default:
		return EINVAL;
	}

	IFCQ_LOCK(ifq);
	if (IFCQ_IS_ENABLED(ifq)) {
		/* { 1, ... } = set request (apply the new level) */
		cqrq_throttle_t req = { 1, level };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
	}
	IFCQ_UNLOCK(ifq);

	if (err == 0) {
		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
		    level);
#if NECP
		necp_update_all_clients();
#endif /* NECP */
		/* Un-throttling: kick the start thread to resume transmission */
		if (level == IFNET_THROTTLE_OFF) {
			ifnet_start(ifp);
		}
	}

	return err;
}
11633
/*
 * Handle SIOCSIFLOG / SIOCGIFLOG ioctls: set or get the interface
 * logging parameters (level, facility flags, category, subcategory).
 * Setting requires PRIV_NET_INTERFACE_CONTROL.
 */
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* At least one recognized facility bit must be set */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11681
/*
 * Apply a logging level/flags/category to the interface and, if the
 * driver registered an output-control callback, forward the non-DLIL
 * facilities to the lower layers.  Setting level IFNET_LOG_DEFAULT
 * clears all facility flags.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* DLIL's own facility is handled here, not by the driver */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			/* nothing addressed to us; drop the level to default */
			level = 0;
		}
	}

	if (err == 0) {
		/* IFNET_LOG_DEFAULT resets all facility flags */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11740
11741 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11742 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11743 int32_t *category, int32_t *subcategory)
11744 {
11745 if (level != NULL) {
11746 *level = ifp->if_log.level;
11747 }
11748 if (flags != NULL) {
11749 *flags = ifp->if_log.flags;
11750 }
11751 if (category != NULL) {
11752 *category = ifp->if_log.category;
11753 }
11754 if (subcategory != NULL) {
11755 *subcategory = ifp->if_log.subcategory;
11756 }
11757
11758 return 0;
11759 }
11760
/*
 * Notify the driver (via its output-control callback) that an address
 * of family 'af' changed on the interface; also refreshes the PF hooks
 * when PF is compiled in.  Returns EOPNOTSUPP when the driver has no
 * callback registered.
 */
int
ifnet_notify_address(struct ifnet *ifp, int af)
{
	struct ifnet_notify_address_params na;

#if PF
	(void) pf_ifaddr_hook(ifp);
#endif /* PF */

	if (ifp->if_output_ctl == NULL) {
		return EOPNOTSUPP;
	}

	bzero(&na, sizeof(na));
	na.address_family = (sa_family_t)af;

	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
	    sizeof(na), &na);
}
11780
11781 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11782 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11783 {
11784 if (ifp == NULL || flowid == NULL) {
11785 return EINVAL;
11786 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11787 !IF_FULLY_ATTACHED(ifp)) {
11788 return ENXIO;
11789 }
11790
11791 *flowid = ifp->if_flowhash;
11792
11793 return 0;
11794 }
11795
11796 errno_t
ifnet_disable_output(struct ifnet * ifp)11797 ifnet_disable_output(struct ifnet *ifp)
11798 {
11799 int err;
11800
11801 if (ifp == NULL) {
11802 return EINVAL;
11803 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11804 !IF_FULLY_ATTACHED(ifp)) {
11805 return ENXIO;
11806 }
11807
11808 if ((err = ifnet_fc_add(ifp)) == 0) {
11809 lck_mtx_lock_spin(&ifp->if_start_lock);
11810 ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11811 lck_mtx_unlock(&ifp->if_start_lock);
11812 }
11813 return err;
11814 }
11815
11816 errno_t
ifnet_enable_output(struct ifnet * ifp)11817 ifnet_enable_output(struct ifnet *ifp)
11818 {
11819 if (ifp == NULL) {
11820 return EINVAL;
11821 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11822 !IF_FULLY_ATTACHED(ifp)) {
11823 return ENXIO;
11824 }
11825
11826 ifnet_start_common(ifp, TRUE, FALSE);
11827 return 0;
11828 }
11829
11830 void
ifnet_flowadv(uint32_t flowhash)11831 ifnet_flowadv(uint32_t flowhash)
11832 {
11833 struct ifnet_fc_entry *ifce;
11834 struct ifnet *ifp;
11835
11836 ifce = ifnet_fc_get(flowhash);
11837 if (ifce == NULL) {
11838 return;
11839 }
11840
11841 VERIFY(ifce->ifce_ifp != NULL);
11842 ifp = ifce->ifce_ifp;
11843
11844 /* flow hash gets recalculated per attach, so check */
11845 if (ifnet_is_attached(ifp, 1)) {
11846 if (ifp->if_flowhash == flowhash) {
11847 (void) ifnet_enable_output(ifp);
11848 }
11849 ifnet_decr_iorefcnt(ifp);
11850 }
11851 ifnet_fc_entry_free(ifce);
11852 }
11853
11854 /*
11855 * Function to compare ifnet_fc_entries in ifnet flow control tree
11856 */
11857 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11858 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11859 {
11860 return fc1->ifce_flowhash - fc2->ifce_flowhash;
11861 }
11862
/*
 * Register a flow-control entry for the interface's current flow hash
 * in the global ifnet_fc_tree.  Returns 0 if the entry already exists
 * or was inserted, EAGAIN on a flow-hash collision with a different
 * interface.  Called when output on the interface is being disabled.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	/* Only TXSTART interfaces with a valid flow hash participate. */
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* Stack-allocated key used only for the RB_FIND lookup. */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	/* Take the lock in spin mode; lookup is short. */
	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer. There can be a collision
		 * on flow hash but the probability is low. Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex */
	/* zalloc below may block, which is illegal under a spin lock. */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11906
/*
 * Look up and remove the flow-control entry matching flowhash from
 * the global tree.  Returns the detached entry (caller must free it
 * with ifnet_fc_entry_free()), or NULL if no entry exists or the
 * associated interface is no longer attached.
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* Stack-allocated key used only for the RB_FIND lookup. */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* Detach the entry; ownership passes to the caller on success. */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	/* ifnet_is_attached()/zfree below must not run under a spin lock. */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11944
/* Return a flow-control entry to its zone allocator. */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11950
11951 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11952 ifnet_calc_flowhash(struct ifnet *ifp)
11953 {
11954 struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11955 uint32_t flowhash = 0;
11956
11957 if (ifnet_flowhash_seed == 0) {
11958 ifnet_flowhash_seed = RandomULong();
11959 }
11960
11961 bzero(&fh, sizeof(fh));
11962
11963 (void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11964 fh.ifk_unit = ifp->if_unit;
11965 fh.ifk_flags = ifp->if_flags;
11966 fh.ifk_eflags = ifp->if_eflags;
11967 fh.ifk_capabilities = ifp->if_capabilities;
11968 fh.ifk_capenable = ifp->if_capenable;
11969 fh.ifk_output_sched_model = ifp->if_output_sched_model;
11970 fh.ifk_rand1 = RandomULong();
11971 fh.ifk_rand2 = RandomULong();
11972
11973 try_again:
11974 flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11975 if (flowhash == 0) {
11976 /* try to get a non-zero flowhash */
11977 ifnet_flowhash_seed = RandomULong();
11978 goto try_again;
11979 }
11980
11981 return flowhash;
11982 }
11983
/*
 * Install (or clear, when len == 0) the network signature for the
 * given address family on the interface.  Returns EINVAL for an
 * unsupported family or oversized signature, ENOMEM when the
 * per-family extension storage has not been allocated.
 *
 * Note the control flow: the early len == 0 / len-too-big branches
 * unlock and then `break` out of the switch, deliberately skipping
 * the trailing unlock of each case, so each path unlocks exactly once.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
12045
/*
 * Copy out the network signature for the given address family.
 * On entry *len is the capacity of `data`; on success it is updated
 * to the actual signature length.  Returns EINVAL for bad arguments,
 * too-small buffer, or unsupported family; ENOENT when no signature
 * is set; ENOMEM when the per-family extension storage is missing.
 * `flags` is optional and currently always reports 0.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* Reject zero-capacity or too-small caller buffers. */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* Reject zero-capacity or too-small caller buffers. */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	/* No signature flags are currently defined; report zero. */
	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
12106
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on the
 * interface (used by CLAT46 translation).  A slot with prefix_len 0
 * clears that slot.  Each non-empty prefix must use one of the
 * RFC 6052 prefix lengths and must not have embedded link-local
 * scope.  If at least one prefix was set, NECP clients are notified.
 * Returns EINVAL on a bad prefix, ENOMEM when the inet6 extension
 * storage is missing.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* Tell NECP clients about the new prefixes, outside the lock. */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
12172
12173 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12174 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12175 {
12176 int i, found_one = 0, error = 0;
12177
12178 if (ifp == NULL) {
12179 return EINVAL;
12180 }
12181
12182 if_inet6data_lock_shared(ifp);
12183
12184 if (IN6_IFEXTRA(ifp) == NULL) {
12185 error = ENOMEM;
12186 goto out;
12187 }
12188
12189 for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12190 if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
12191 found_one = 1;
12192 }
12193 }
12194
12195 if (found_one == 0) {
12196 error = ENOENT;
12197 goto out;
12198 }
12199
12200 if (prefixes) {
12201 bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
12202 sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
12203 }
12204
12205 out:
12206 if_inet6data_lock_done(ifp);
12207
12208 return error;
12209 }
12210
/*
 * Debug hook on the output path: when HWCKSUM_DBG_FINALIZE_FORCED is
 * enabled, force software finalization of delayed checksums before
 * the packet reaches the driver, and count what was finalized.
 * TSO packets are skipped since their checksums are computed by the
 * hardware per-segment.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		/* Other protocol families are not checksum-finalized. */
		return;
	}
}
12252
/*
 * Debug hook on the input path for exercising partial (16-bit 1's
 * complement) checksum offload.  Depending on hwcksum_dbg_mode it can:
 *   - force partial-checksum offload in software
 *     (HWCKSUM_DBG_PARTIAL_FORCED),
 *   - verify a driver-supplied partial checksum against a software
 *     recomputation, and
 *   - re-base the partial sum to a different start offset to emulate
 *     hardware with fixed checksum start positions
 *     (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ).
 * Only IPv4/IPv6 packets are considered; counters track each outcome.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity-check that the frame header lies within the mbuf data. */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Link-layer header length: gap between frame header and payload. */
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		/* Only IP traffic carries checksums we can exercise. */
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* Rebase the offset to be relative to the IP payload. */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* Shift the partial sum from rxoff to aoff. */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
12377
12378 static int
12379 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
12380 {
12381 #pragma unused(arg1, arg2)
12382 u_int32_t i;
12383 int err;
12384
12385 i = hwcksum_dbg_mode;
12386
12387 err = sysctl_handle_int(oidp, &i, 0, req);
12388 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12389 return err;
12390 }
12391
12392 if (hwcksum_dbg == 0) {
12393 return ENODEV;
12394 }
12395
12396 if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12397 return EINVAL;
12398 }
12399
12400 hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12401
12402 return err;
12403 }
12404
12405 static int
12406 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12407 {
12408 #pragma unused(arg1, arg2)
12409 u_int32_t i;
12410 int err;
12411
12412 i = hwcksum_dbg_partial_rxoff_forced;
12413
12414 err = sysctl_handle_int(oidp, &i, 0, req);
12415 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12416 return err;
12417 }
12418
12419 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12420 return ENODEV;
12421 }
12422
12423 hwcksum_dbg_partial_rxoff_forced = i;
12424
12425 return err;
12426 }
12427
12428 static int
12429 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12430 {
12431 #pragma unused(arg1, arg2)
12432 u_int32_t i;
12433 int err;
12434
12435 i = hwcksum_dbg_partial_rxoff_adj;
12436
12437 err = sysctl_handle_int(oidp, &i, 0, req);
12438 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12439 return err;
12440 }
12441
12442 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12443 return ENODEV;
12444 }
12445
12446 hwcksum_dbg_partial_rxoff_adj = i;
12447
12448 return err;
12449 }
12450
12451 static int
12452 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12453 {
12454 #pragma unused(oidp, arg1, arg2)
12455 int err;
12456
12457 if (req->oldptr == USER_ADDR_NULL) {
12458 }
12459 if (req->newptr != USER_ADDR_NULL) {
12460 return EPERM;
12461 }
12462 err = SYSCTL_OUT(req, &tx_chain_len_stats,
12463 sizeof(struct chain_len_stats));
12464
12465 return err;
12466 }
12467
12468 #if DEBUG || DEVELOPMENT
12469 /* Blob for sum16 verification */
/*
 * Opaque byte blob used purely as input for the checksum self-tests
 * below (appears to be gzip data; the content itself is irrelevant,
 * only that it is fixed and non-trivial).
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};

/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t init;   /* TRUE once sumr has been filled in at run time */
	uint16_t len;     /* span length (bytes from start of sumdata) */
	uint16_t sumr;    /* reference */
	uint16_t sumrp;   /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl[] */
#define SUMTBL_MAX ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12530
/*
 * Boot-time self-test for the m_sum16()/b_sum16() 16-bit 1's
 * complement checksum implementations.  For every span length in
 * sumtbl[] and every byte alignment within a uint64_t, it compares
 * the optimized routines against the reference in_cksum_mbuf_ref()
 * and against the precomputed values; any mismatch panics, since a
 * broken checksum routine would silently corrupt every packet.
 * (DEBUG/DEVELOPMENT kernels only.)
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *); /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (caddr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			/* First pass computes the reference sum; later
			 * passes reuse the cached value. */
			if (!sumtbl[n].init) {
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (caddr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by aligment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
12619 #endif /* DEBUG || DEVELOPMENT */
12620
/* Expand to a switch case returning the symbol's name as a string. */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* kernel-event code to its symbolic name for logging.
 * Returns the empty string for unrecognized codes.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
	CASE_STRINGIFY(KEV_DL_SIFFLAGS);
	CASE_STRINGIFY(KEV_DL_SIFMETRICS);
	CASE_STRINGIFY(KEV_DL_SIFMTU);
	CASE_STRINGIFY(KEV_DL_SIFPHYS);
	CASE_STRINGIFY(KEV_DL_SIFMEDIA);
	CASE_STRINGIFY(KEV_DL_SIFGENERIC);
	CASE_STRINGIFY(KEV_DL_ADDMULTI);
	CASE_STRINGIFY(KEV_DL_DELMULTI);
	CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
	CASE_STRINGIFY(KEV_DL_IF_DETACHING);
	CASE_STRINGIFY(KEV_DL_IF_DETACHED);
	CASE_STRINGIFY(KEV_DL_LINK_OFF);
	CASE_STRINGIFY(KEV_DL_LINK_ON);
	CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
	CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
	CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
	CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
	CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
	CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
	CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
	CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
	CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
	CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
	CASE_STRINGIFY(KEV_DL_ISSUES);
	CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
12657
12658 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12659 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12660 {
12661 #pragma unused(arg1)
12662 struct ifnet *ifp = arg0;
12663
12664 if (ifnet_is_attached(ifp, 1)) {
12665 nstat_ifnet_threshold_reached(ifp->if_index);
12666 ifnet_decr_iorefcnt(ifp);
12667 }
12668 }
12669
/*
 * Check whether the interface's combined RX+TX byte count has grown
 * past if_data_threshold since the last notification, and if so
 * schedule (or immediately fire) the data-threshold thread call.
 * The OSCompareAndSwap64 ensures only one racing caller claims the
 * notification, and thread_call_isactive() plus the periodic
 * deadline rate-limit how often NetworkStatistics is notified.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Defer to the next periodic boundary. */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No interval configured: notify immediately. */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12699
12700 #if (DEVELOPMENT || DEBUG)
12701 /*
12702 * The sysctl variable name contains the input parameters of
12703 * ifnet_get_keepalive_offload_frames()
12704 * ifp (interface index): name[0]
12705 * frames_array_count: name[1]
12706 * frame_data_offset: name[2]
12707 * The return length gives used_frames_count
12708 */
/*
 * sysctl handler (DEVELOPMENT/DEBUG only) exporting an interface's
 * keep-alive offload frames.  Inputs arrive via the OID name vector:
 * name[0] = interface index, name[1] = frames_array_count,
 * name[2] = frame_data_offset.  Root only; read-only.
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root can get look at other people TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
	goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	/*
	 * NOTE(review): name[1]/name[2] are user-controlled ints; a
	 * negative name[2] becomes a huge frame_data_offset here —
	 * presumably rejected by ifnet_get_keepalive_offload_frames(),
	 * TODO confirm.
	 */
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/* Make sure the passed buffer is large enough */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	/*
	 * NOTE(review): ifp is used below after dropping the head lock
	 * without taking an I/O refcount — verify the interface cannot
	 * be freed out from under this sysctl.
	 */
	ifnet_head_done();

	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* Copy each used frame out individually. */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
12800 #endif /* DEVELOPMENT || DEBUG */
12801
/* Forward per-flow interface statistics to the TCP layer. */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12808
/* Atomically OR set_flags into *flags_p; returns the previous value. */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12814
/* Atomically clear clear_flags from *flags_p. */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12820
/* Atomically set extended-flag bits; returns the previous if_eflags. */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12826
/* Atomically clear extended-flag bits on the interface. */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12832
/*
 * Atomically set `set_flags' in the interface's extra flags
 * (if_xflags); returns the previous if_xflags value.
 */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12838
/*
 * Atomically clear `clear_flags' from the interface's extra flags
 * (if_xflags).
 */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12844
/*
 * Bump the interface's traffic-rule generation counter (relaxed atomic
 * increment) so that readers using ifnet_sync_traffic_rule_genid() can
 * detect that the rule state changed.
 */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}
12850
12851 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12852 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12853 {
12854 if (*genid != ifp->if_traffic_rule_genid) {
12855 *genid = ifp->if_traffic_rule_genid;
12856 return TRUE;
12857 }
12858 return FALSE;
12859 }
/*
 * Publish a new traffic-rule count for the interface (release store so
 * it is visible before the generation bump below), then bump the
 * generation id so readers re-sync.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
12866
12867 static void
log_hexdump(void * data,size_t len)12868 log_hexdump(void *data, size_t len)
12869 {
12870 size_t i, j, k;
12871 unsigned char *ptr = (unsigned char *)data;
12872 #define MAX_DUMP_BUF 32
12873 unsigned char buf[3 * MAX_DUMP_BUF + 1];
12874
12875 for (i = 0; i < len; i += MAX_DUMP_BUF) {
12876 for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12877 unsigned char msnbl = ptr[j] >> 4;
12878 unsigned char lsnbl = ptr[j] & 0x0f;
12879
12880 buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12881 buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12882
12883 if ((j % 2) == 1) {
12884 buf[k++] = ' ';
12885 }
12886 if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12887 buf[k++] = ' ';
12888 }
12889 }
12890 buf[k] = 0;
12891 os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12892 }
12893 }
12894
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Report whether interface filters are "compatible".  With a specific
 * interface, true iff it has no non-OS filters attached; with ifp ==
 * NULL, true iff the global attach count does not exceed the count of
 * OS-provided filter attachments (i.e. no non-OS filters system-wide).
 */
static bool
net_check_compatible_if_filter(struct ifnet *ifp)
{
	if (ifp != NULL) {
		return ifp->if_flt_non_os_count == 0;
	}
	return net_api_stats.nas_iflt_attach_count <=
	       net_api_stats.nas_iflt_attach_os_count;
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
12911
/*
 * Helper for dlil_dump_top_if_qlen(): after each scnprintf() into the
 * output buffer, subtract the `k' bytes just produced from the space
 * remaining (`clen'), bail out to the `done' label when less than one
 * byte remains, and advance the write cursor `c'.  Relies on k, clen,
 * c and done: being declared in the enclosing function.
 */
#define DUMP_BUF_CHK() { \
	clen -= k; \
	if (clen < 1) \
		goto done; \
	c += k; \
}

int dlil_dump_top_if_qlen(char *, int);
/*
 * Scan attached interfaces and write into `str' (capacity `str_len')
 * one line naming the interface with the deepest send classq
 * (ifcq_len) and one naming the interface with the deepest DLIL input
 * queue.  Returns the number of bytes written.
 *
 * NOTE(review): ifindex2ifnet is walked with no visible locking here —
 * presumably acceptable for a debug dump; confirm against callers.
 * NOTE(review): the loop bound is `ifidx < if_index'; if valid indices
 * run 1..if_index inclusive, the last interface is skipped — confirm.
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;          /* write cursor, advanced by DUMP_BUF_CHK() */
	int k, clen = str_len;  /* k: last scnprintf() count; clen: space left */
	struct ifnet *top_ifcq_ifp = NULL;  /* deepest send classq so far */
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;   /* deepest input queue so far */
	uint32_t top_inq_len = 0;

	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		/* cast assumes the ifnet is embedded at the start of
		 * struct dlil_ifnet — TODO confirm layout */
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12960