1 /*
2 * Copyright (c) 1999-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections. This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/if_arp.h>
54 #include <net/iptap.h>
55 #include <net/pktap.h>
56 #include <net/nwk_wq.h>
57 #include <sys/kern_event.h>
58 #include <sys/kdebug.h>
59 #include <sys/mcache.h>
60 #include <sys/syslog.h>
61 #include <sys/protosw.h>
62 #include <sys/priv.h>
63
64 #include <kern/assert.h>
65 #include <kern/task.h>
66 #include <kern/thread.h>
67 #include <kern/sched_prim.h>
68 #include <kern/locks.h>
69 #include <kern/zalloc.h>
70
71 #include <net/kpi_protocol.h>
72 #include <net/if_types.h>
73 #include <net/if_ipsec.h>
74 #include <net/if_llreach.h>
75 #include <net/if_utun.h>
76 #include <net/kpi_interfacefilter.h>
77 #include <net/classq/classq.h>
78 #include <net/classq/classq_sfb.h>
79 #include <net/flowhash.h>
80 #include <net/ntstat.h>
81 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
82 #include <skywalk/lib/net_filter_event.h>
83 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143
144 #include <os/log.h>
145
146 #define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0)
147 #define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2)
148 #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
149 #define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
150 #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
151
152 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
153 #define MAX_LINKADDR 4 /* LONGWORDS */
154
155 #if 1
156 #define DLIL_PRINTF printf
157 #else
158 #define DLIL_PRINTF kprintf
159 #endif
160
161 #define IF_DATA_REQUIRE_ALIGNED_64(f) \
162 _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
163
164 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \
165 _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
166
167 enum {
168 kProtoKPI_v1 = 1,
169 kProtoKPI_v2 = 2
170 };
171
172 uint64_t if_creation_generation_count = 0;
173
174 /*
175 * List of if_proto structures in if_proto_hash[] is protected by
176 * the ifnet lock. The rest of the fields are initialized at protocol
177 * attach time and never change, thus no lock required as long as
178 * a reference to it is valid, via if_proto_ref().
179 */
struct if_proto {
	SLIST_ENTRY(if_proto) next_hash;        /* if_proto_hash[] chain linkage (ifnet lock) */
	u_int32_t refcount;                     /* outstanding references; see if_proto_ref()/if_proto_free() */
	u_int32_t detached;                     /* non-zero once the protocol has been detached */
	struct ifnet *ifp;                      /* interface this protocol is attached to */
	protocol_family_t protocol_family;      /* protocol family identifier */
	int proto_kpi;                          /* selects valid union arm: kProtoKPI_v1 or kProtoKPI_v2 */
	union {
		/* v1 KPI callbacks (valid when proto_kpi == kProtoKPI_v1) */
		struct {
			proto_media_input input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v1;
		/* v2 KPI callbacks (valid when proto_kpi == kProtoKPI_v2) */
		struct {
			proto_media_input_v2 input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v2;
	} kpi;
};
208
209 SLIST_HEAD(proto_hash_entry, if_proto);
210
211 #define DLIL_SDLDATALEN \
212 (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
213
/*
 * DLIL-private wrapper around the public ifnet.  The embedded dl_if must
 * be the first member so that DLIL_TO_IFP()/IFP_TO_DLIL() below can cast
 * between the two.  Instances are allocated from dlif_zone.
 */
struct dlil_ifnet {
	struct ifnet dl_if;             /* public ifnet (must be first) */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
	u_int32_t dl_if_flags;          /* flags (DLIF_*, below) */
	u_int32_t dl_if_refcnt;         /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void *dl_if_uniqueid;           /* unique interface id */
	size_t dl_if_uniqueid_len;      /* length of the unique id */
	char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */
	char dl_if_xnamestorage[IFXNAMSIZ]; /* external name storage */
	struct {
		struct ifaddr ifa;      /* lladdr ifa */
		u_int8_t asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr;
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set; /* non-zero once permanent address recorded */
	u_int8_t dl_if_unused;          /* padding / reserved */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t dl_if_attach;          /* attach PC stacktrace */
	ctrace_t dl_if_detach;          /* detach PC stacktrace */
};
241
242 /* Values for dl_if_flags (private to DLIL) */
243 #define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */
#define DLIF_REUSE 0x2 /* DLIL ifnet recycled, ifnet is not new */
245 #define DLIF_DEBUG 0x4 /* has debugging info */
246
247 #define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */
248
249 /* For gdb */
250 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
251
/*
 * Debug variant of dlil_ifnet that additionally records reference
 * hold/release counts and caller stack traces (presumably used when
 * ifnet_debug is enabled / DLIF_DEBUG is set -- confirm against the
 * allocation path, which is outside this chunk).
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet dldbg_dlif;           /* dlil_ifnet (must be first) */
	u_int16_t dldbg_if_refhold_cnt;         /* # ifnet references */
	u_int16_t dldbg_if_refrele_cnt;         /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
262
263 #define DLIL_TO_IFP(s) (&s->dl_if)
264 #define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s)
265
/*
 * Per-interface packet filter registration; allocated from dlif_filt_zone.
 * Callbacks mirror the iff_* KPI in net/kpi_interfacefilter.h.
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter) filt_next;    /* interface's filter list linkage */
	u_int32_t filt_skip;            /* non-zero: bypass this filter (NOTE(review): presumably set while detaching -- confirm) */
	u_int32_t filt_flags;           /* filter flags */
	ifnet_t filt_ifp;               /* interface the filter is attached to */
	const char *filt_name;          /* filter name (for identification) */
	void *filt_cookie;              /* opaque client context passed to callbacks */
	protocol_family_t filt_protocol; /* protocol of interest; semantics per iff KPI */
	iff_input_func filt_input;      /* inbound packet callback */
	iff_output_func filt_output;    /* outbound packet callback */
	iff_event_func filt_event;      /* interface event callback */
	iff_ioctl_func filt_ioctl;      /* ioctl callback */
	iff_detached_func filt_detached; /* detach-complete callback */
};
280
281 /* Mbuf queue used for freeing the excessive mbufs */
282 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
283
284 struct proto_input_entry;
285
286 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
287
288 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
289
290 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
291 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
292 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
293 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
294 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
295
296 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
297 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
298 &dlil_lck_attributes);
299 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
300 &dlil_lck_attributes);
301
302 #if DEBUG
303 static unsigned int ifnet_debug = 1; /* debugging (enabled) */
304 #else
305 static unsigned int ifnet_debug; /* debugging (disabled) */
306 #endif /* !DEBUG */
307 static unsigned int dlif_size; /* size of dlil_ifnet to allocate */
308 static unsigned int dlif_bufsize; /* size of dlif_size + headroom */
309 static struct zone *dlif_zone; /* zone for dlil_ifnet */
310 #define DLIF_ZONE_NAME "ifnet" /* zone name */
311
312 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
313
314 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
315
316 static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */
317 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
318 static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */
319 #define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */
320
321 static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */
322 static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */
323 static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */
324 #define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */
325
326 static u_int32_t net_rtref;
327
328 static struct dlil_main_threading_info dlil_main_input_thread_info;
329 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
330 (struct dlil_threading_info *)&dlil_main_input_thread_info;
331
332 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
333 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
334 static void dlil_if_trace(struct dlil_ifnet *, int);
335 static void if_proto_ref(struct if_proto *);
336 static void if_proto_free(struct if_proto *);
337 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
338 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
339 u_int32_t list_count);
340 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
341 static void if_flt_monitor_busy(struct ifnet *);
342 static void if_flt_monitor_unbusy(struct ifnet *);
343 static void if_flt_monitor_enter(struct ifnet *);
344 static void if_flt_monitor_leave(struct ifnet *);
345 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
346 char **, protocol_family_t);
347 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
348 protocol_family_t);
349 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
350 const struct sockaddr_dl *);
351 static int ifnet_lookup(struct ifnet *);
352 static void if_purgeaddrs(struct ifnet *);
353
354 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
355 struct mbuf *, char *);
356 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
357 struct mbuf *);
358 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
359 mbuf_t *, const struct sockaddr *, void *, char *, char *);
360 static void ifproto_media_event(struct ifnet *, protocol_family_t,
361 const struct kev_msg *);
362 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
363 unsigned long, void *);
364 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
365 struct sockaddr_dl *, size_t);
366 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
367 const struct sockaddr_dl *, const struct sockaddr *,
368 const struct sockaddr_dl *, const struct sockaddr *);
369
370 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
371 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
372 boolean_t poll, struct thread *tp);
373 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
374 struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
375 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
376 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
377 protocol_family_t *);
378 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
379 const struct ifnet_demux_desc *, u_int32_t);
380 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
381 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
382 #if !XNU_TARGET_OS_OSX
383 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
384 const struct sockaddr *, const char *, const char *,
385 u_int32_t *, u_int32_t *);
386 #else /* XNU_TARGET_OS_OSX */
387 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
388 const struct sockaddr *, const char *, const char *);
389 #endif /* XNU_TARGET_OS_OSX */
390 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
391 const struct sockaddr *, const char *, const char *,
392 u_int32_t *, u_int32_t *);
393 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
394 static void ifp_if_free(struct ifnet *);
395 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
396 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
397 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
398
399 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
400 dlil_freeq_t *, struct ifnet_stat_increment_param *);
401
402 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
403 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
404 boolean_t, struct thread *);
405 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
406 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
407 boolean_t, struct thread *);
408
409 static void dlil_main_input_thread_func(void *, wait_result_t);
410 static void dlil_main_input_thread_cont(void *, wait_result_t);
411
412 static void dlil_input_thread_func(void *, wait_result_t);
413 static void dlil_input_thread_cont(void *, wait_result_t);
414
415 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
416 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
417
418 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
419 thread_continue_t *);
420 static void dlil_terminate_input_thread(struct dlil_threading_info *);
421 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
422 struct dlil_threading_info *, struct ifnet *, boolean_t);
423 static boolean_t dlil_input_stats_sync(struct ifnet *,
424 struct dlil_threading_info *);
425 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
426 u_int32_t, ifnet_model_t, boolean_t);
427 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
428 const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
429 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
430 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
431 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
432 #if DEBUG || DEVELOPMENT
433 static void dlil_verify_sum16(void);
434 #endif /* DEBUG || DEVELOPMENT */
435 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
436 protocol_family_t);
437 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
438 protocol_family_t);
439
440 static void dlil_incr_pending_thread_count(void);
441 static void dlil_decr_pending_thread_count(void);
442
443 static void ifnet_detacher_thread_func(void *, wait_result_t);
444 static void ifnet_detacher_thread_cont(void *, wait_result_t);
445 static void ifnet_detach_final(struct ifnet *);
446 static void ifnet_detaching_enqueue(struct ifnet *);
447 static struct ifnet *ifnet_detaching_dequeue(void);
448
449 static void ifnet_start_thread_func(void *, wait_result_t);
450 static void ifnet_start_thread_cont(void *, wait_result_t);
451
452 static void ifnet_poll_thread_func(void *, wait_result_t);
453 static void ifnet_poll_thread_cont(void *, wait_result_t);
454
455 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
456 classq_pkt_t *, boolean_t, boolean_t *);
457
458 static void ifp_src_route_copyout(struct ifnet *, struct route *);
459 static void ifp_src_route_copyin(struct ifnet *, struct route *);
460 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
461 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
462
463 static errno_t if_mcasts_update_async(struct ifnet *);
464
465 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
466 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
467 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
468 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
469 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
470 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
471 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
472 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
473 static int sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS;
474 static int sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS;
475 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
476 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
477 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
478
479 struct chain_len_stats tx_chain_len_stats;
480 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
481
482 #if TEST_INPUT_THREAD_TERMINATION
483 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
484 #endif /* TEST_INPUT_THREAD_TERMINATION */
485
486 /* The following are protected by dlil_ifnet_lock */
487 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
488 static u_int32_t ifnet_detaching_cnt;
489 static boolean_t ifnet_detaching_embryonic;
490 static void *ifnet_delayed_run; /* wait channel for detaching thread */
491
492 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
493 &dlil_lck_attributes);
494
495 static uint32_t ifnet_flowhash_seed;
496
/*
 * Key material folded into the per-interface flow hash; see
 * ifnet_calc_flowhash() (declared below, defined outside this chunk).
 */
struct ifnet_flowhash_key {
	char ifk_name[IFNAMSIZ];        /* interface name */
	uint32_t ifk_unit;              /* interface unit number */
	uint32_t ifk_flags;             /* interface flags */
	uint32_t ifk_eflags;            /* extended flags */
	uint32_t ifk_capabilities;      /* supported capabilities */
	uint32_t ifk_capenable;         /* enabled capabilities */
	uint32_t ifk_output_sched_model; /* output scheduling model */
	uint32_t ifk_rand1;             /* presumably random salt -- confirm in ifnet_calc_flowhash() */
	uint32_t ifk_rand2;             /* presumably random salt -- confirm in ifnet_calc_flowhash() */
};
508
509 /* Flow control entry per interface */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;    /* ifnet_fc_tree linkage (ifnet_fc_lock) */
	u_int32_t ifce_flowhash;        /* flow hash of the interface; tree lookup key (see ifnet_fc_get()) */
	struct ifnet *ifce_ifp;         /* back-pointer to the interface */
};
515
516 static uint32_t ifnet_calc_flowhash(struct ifnet *);
517 static int ifce_cmp(const struct ifnet_fc_entry *,
518 const struct ifnet_fc_entry *);
519 static int ifnet_fc_add(struct ifnet *);
520 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
521 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
522
523 /* protected by ifnet_fc_lock */
524 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
525 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
526 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
527
528 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
529
530 extern void bpfdetach(struct ifnet *);
531 extern void proto_input_run(void);
532
533 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
534 u_int32_t flags);
535 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
536 u_int32_t flags);
537
538 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
539
540 #if CONFIG_MACF
541 #if !XNU_TARGET_OS_OSX
542 int dlil_lladdr_ckreq = 1;
543 #else /* XNU_TARGET_OS_OSX */
544 int dlil_lladdr_ckreq = 0;
545 #endif /* XNU_TARGET_OS_OSX */
546 #endif /* CONFIG_MACF */
547
548 #if DEBUG
549 int dlil_verbose = 1;
550 #else
551 int dlil_verbose = 0;
552 #endif /* DEBUG */
553 #if IFNET_INPUT_SANITY_CHK
554 /* sanity checking of input packet lists received */
555 static u_int32_t dlil_input_sanity_check = 0;
556 #endif /* IFNET_INPUT_SANITY_CHK */
557 /* rate limit debug messages */
558 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
559
560 SYSCTL_DECL(_net_link_generic_system);
561
562 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
563 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
564
565 #define IF_SNDQ_MINLEN 32
566 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
567 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
568 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
569 sysctl_sndq_maxlen, "I", "Default transmit queue max length");
570
571 #define IF_RCVQ_MINLEN 32
572 #define IF_RCVQ_MAXLEN 256
573 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
574 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
575 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
576 sysctl_rcvq_maxlen, "I", "Default receive queue max length");
577
578 /*
579 * Protect against possible memory starvation that may happen
580 * when the driver is pushing data faster than the AP can process.
581 *
582 * If at any point during DLIL input phase any of the input queues
583 * exceeds the burst limit, DLIL will start to trim the queue,
584 * by returning mbufs in the input queue to the cache from which
585 * the mbufs were originally allocated, starting from the oldest
586 * mbuf and continuing until the new limit (see below) is reached.
587 *
588 * In order to avoid a steplocked equilibrium, the trimming
589 * will continue PAST the burst limit, until the corresponding
590 * input queue is reduced to `if_rcvq_trim_pct' %.
591 *
592 * For example, if the input queue limit is 1024 packets,
593 * and the trim percentage (`if_rcvq_trim_pct') is 80 %,
594 * the trimming will continue until the queue contains 819 packets
595 * (1024 * 80 / 100 == 819).
596 *
597 * Setting the burst limit too low can hurt the throughput,
598 * while setting the burst limit too high can defeat the purpose.
599 */
600 #define IF_RCVQ_BURST_LIMIT_MIN 1024
601 #define IF_RCVQ_BURST_LIMIT_DEFAULT 8192
602 #define IF_RCVQ_BURST_LIMIT_MAX 32768
603 uint32_t if_rcvq_burst_limit = IF_RCVQ_BURST_LIMIT_DEFAULT;
604 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_burst_limit,
605 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_burst_limit, IF_RCVQ_BURST_LIMIT_DEFAULT,
606 sysctl_rcvq_burst_limit, "I", "Upper memory limit for inbound data");
607
608 #define IF_RCVQ_TRIM_PCT_MIN 20
609 #define IF_RCVQ_TRIM_PCT_DEFAULT 80
610 #define IF_RCVQ_TRIM_PCT_MAX 100
611 uint32_t if_rcvq_trim_pct = IF_RCVQ_TRIM_PCT_DEFAULT;
612 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_trim_pct,
613 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_trim_pct, IF_RCVQ_TRIM_PCT_DEFAULT,
614 sysctl_rcvq_trim_pct, "I",
615 "Percentage (0 - 100) of the queue limit to keep after detecting an overflow burst");
616
617 #define IF_RXPOLL_DECAY 2 /* ilog2 of EWMA decay rate (4) */
618 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
619 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
620 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
621 "ilog2 of EWMA decay rate of avg inbound packets");
622
623 #define IF_RXPOLL_MODE_HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10 ms */
624 #define IF_RXPOLL_MODE_HOLDTIME (1000ULL * 1000 * 1000) /* 1 sec */
625 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
626 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
627 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
628 IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
629 "Q", "input poll mode freeze time");
630
631 #define IF_RXPOLL_SAMPLETIME_MIN (1ULL * 1000 * 1000) /* 1 ms */
632 #define IF_RXPOLL_SAMPLETIME (10ULL * 1000 * 1000) /* 10 ms */
633 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
634 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
635 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
636 IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
637 "Q", "input poll sampling time");
638
639 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
640 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
641 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
642 IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
643 "Q", "input poll interval (time)");
644
645 #define IF_RXPOLL_INTERVAL_PKTS 0 /* 0 (disabled) */
646 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
647 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
648 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
649 IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
650
651 #define IF_RXPOLL_WLOWAT 10
652 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
653 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
654 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
655 IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
656 "I", "input poll wakeup low watermark");
657
658 #define IF_RXPOLL_WHIWAT 100
659 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
660 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
661 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
662 IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
663 "I", "input poll wakeup high watermark");
664
665 static u_int32_t if_rxpoll_max = 0; /* 0 (automatic) */
666 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
667 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
668 "max packets per poll call");
669
670 u_int32_t if_rxpoll = 1;
671 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
672 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
673 sysctl_rxpoll, "I", "enable opportunistic input polling");
674
675 #if TEST_INPUT_THREAD_TERMINATION
676 static u_int32_t if_input_thread_termination_spin = 0;
677 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
678 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
679 &if_input_thread_termination_spin, 0,
680 sysctl_input_thread_termination_spin,
681 "I", "input thread termination spin limit");
682 #endif /* TEST_INPUT_THREAD_TERMINATION */
683
684 static u_int32_t cur_dlil_input_threads = 0;
685 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
686 CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
687 "Current number of DLIL input threads");
688
689 #if IFNET_INPUT_SANITY_CHK
690 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
691 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
692 "Turn on sanity checking in DLIL input");
693 #endif /* IFNET_INPUT_SANITY_CHK */
694
695 static u_int32_t if_flowadv = 1;
696 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
697 CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
698 "enable flow-advisory mechanism");
699
700 static u_int32_t if_delaybased_queue = 1;
701 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
702 CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
703 "enable delay based dynamic queue sizing");
704
705 static uint64_t hwcksum_in_invalidated = 0;
706 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
707 hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
708 &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
709
710 uint32_t hwcksum_dbg = 0;
711 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
712 CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
713 "enable hardware cksum debugging");
714
715 u_int32_t ifnet_start_delayed = 0;
716 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
717 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
718 "number of times start was delayed");
719
720 u_int32_t ifnet_delay_start_disabled = 0;
721 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
722 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
723 "number of times start was delayed");
724
/*
 * Atomically bump ifnet_delay_start_disabled, the counter exported via
 * the net.link.generic.system.start_delay_disabled sysctl above.
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
730
/* Debug-mode bits accepted by the hwcksum_dbg_mode sysctl below */
#define HWCKSUM_DBG_PARTIAL_FORCED 0x1 /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ 0x2 /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED 0x10 /* forced finalize */
#define HWCKSUM_DBG_MASK \
    (HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ | \
    HWCKSUM_DBG_FINALIZE_FORCED)

/* Current debug mode; writes are validated by sysctl_hwcksum_dbg_mode */
static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

/* Read-only counters for the partial-checksum debug paths */
static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

/* Tunable rx start-offset knobs, validated by their PROC handlers */
static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

/* Global enable switches for TX/RX hardware checksum offload (default on) */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

/* Data-threshold notification: on/off switch and notification interval */
static uint32_t threshold_notify = 1; /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2; /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

/* Global input-path knobs (not sysctl-exported here) */
unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1; /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
848
849 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)850 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
851 {
852 /*
853 * update filter count and route_generation ID to let TCP
854 * know it should reevalute doing TSO or not
855 */
856 if (filter_enable) {
857 OSAddAtomic(1, &ifp->if_flt_no_tso_count);
858 } else {
859 VERIFY(ifp->if_flt_no_tso_count != 0);
860 OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
861 }
862 routegenid_update();
863 }
864
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* Derived defaults: whether the fsw IP/transport netagents start enabled */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

/* Whether compat netif is plumbed for all interfaces (see ifnet_needs_compat) */
unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
883
884 #if (DEVELOPMENT || DEBUG)
885 static int
886 if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
887 {
888 #pragma unused(oidp, arg1, arg2)
889 unsigned int new_value;
890 int changed;
891 int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
892 &new_value, &changed);
893 if (error) {
894 return error;
895 }
896 if (changed) {
897 if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
898 (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
899 return ENOTSUP;
900 }
901 if_attach_nx = new_value;
902 }
903 return 0;
904 }
905
/* Dev/debug-only sysctl to inspect/tune the nexus auto-attach flags */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");
909
910 #endif /* DEVELOPMENT || DEBUG */
911
912 static int
913 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
914 {
915 #pragma unused(oidp, arg1, arg2)
916 unsigned int new_value;
917 int changed;
918 int error;
919
920 error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
921 sizeof(if_enable_fsw_transport_netagent),
922 &new_value, &changed);
923 if (error == 0 && changed != 0) {
924 if (new_value != 0 && new_value != 1) {
925 /* only allow 0 or 1 */
926 error = EINVAL;
927 } else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
928 /* netagent can be enabled/disabled */
929 if_enable_fsw_transport_netagent = new_value;
930 if (new_value == 0) {
931 kern_nexus_deregister_netagents();
932 } else {
933 kern_nexus_register_netagents();
934 }
935 } else {
936 /* netagent can't be enabled */
937 error = ENOTSUP;
938 }
939 }
940 return error;
941 }
942
/* net.link.generic.system.enable_netagent — see handler above */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
951
952 boolean_t
ifnet_nx_noauto(ifnet_t ifp)953 ifnet_nx_noauto(ifnet_t ifp)
954 {
955 return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
956 }
957
/*
 * TRUE when a flowswitch should not be auto-attached; currently this is
 * exactly the low-latency interfaces.
 */
boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}
963
964 boolean_t
ifnet_is_low_latency(ifnet_t ifp)965 ifnet_is_low_latency(ifnet_t ifp)
966 {
967 return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
968 }
969
970 boolean_t
ifnet_needs_compat(ifnet_t ifp)971 ifnet_needs_compat(ifnet_t ifp)
972 {
973 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
974 return FALSE;
975 }
976 #if !XNU_TARGET_OS_OSX
977 /*
978 * To conserve memory, we plumb in the compat layer selectively; this
979 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
980 * In particular, we check for Wi-Fi Access Point.
981 */
982 if (IFNET_IS_WIFI(ifp)) {
983 /* Wi-Fi Access Point */
984 if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
985 ifp->if_name[2] == '\0') {
986 return if_netif_all;
987 }
988 }
989 #else /* XNU_TARGET_OS_OSX */
990 #pragma unused(ifp)
991 #endif /* XNU_TARGET_OS_OSX */
992 return TRUE;
993 }
994
995 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)996 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
997 {
998 if (if_is_fsw_transport_netagent_enabled()) {
999 /* check if netagent has been manually enabled for ipsec/utun */
1000 if (ifp->if_family == IFNET_FAMILY_IPSEC) {
1001 return ipsec_interface_needs_netagent(ifp);
1002 } else if (ifp->if_family == IFNET_FAMILY_UTUN) {
1003 return utun_interface_needs_netagent(ifp);
1004 }
1005
1006 /* check ifnet no auto nexus override */
1007 if (ifnet_nx_noauto(ifp)) {
1008 return FALSE;
1009 }
1010
1011 /* check global if_attach_nx configuration */
1012 switch (ifp->if_family) {
1013 case IFNET_FAMILY_CELLULAR:
1014 case IFNET_FAMILY_ETHERNET:
1015 if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
1016 return TRUE;
1017 }
1018 break;
1019 default:
1020 break;
1021 }
1022 }
1023 return FALSE;
1024 }
1025
1026 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)1027 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
1028 {
1029 #pragma unused(ifp)
1030 if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
1031 return TRUE;
1032 }
1033 return FALSE;
1034 }
1035
1036 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)1037 ifnet_needs_netif_netagent(ifnet_t ifp)
1038 {
1039 #pragma unused(ifp)
1040 return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1041 }
1042
1043 static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,const char * func_str,uuid_t instance,uuid_t device)1044 dlil_detach_nexus_instance(nexus_controller_t controller,
1045 const char *func_str, uuid_t instance, uuid_t device)
1046 {
1047 errno_t err;
1048
1049 if (instance == NULL || uuid_is_null(instance)) {
1050 return FALSE;
1051 }
1052
1053 /* followed by the device port */
1054 if (device != NULL && !uuid_is_null(device)) {
1055 err = kern_nexus_ifdetach(controller, instance, device);
1056 if (err != 0) {
1057 DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
1058 func_str, err);
1059 }
1060 }
1061 err = kern_nexus_controller_free_provider_instance(controller,
1062 instance);
1063 if (err != 0) {
1064 DLIL_PRINTF("%s free_provider_instance failed %d\n",
1065 func_str, err);
1066 }
1067 return TRUE;
1068 }
1069
1070 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1071 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1072 uuid_t device)
1073 {
1074 boolean_t detached = FALSE;
1075 nexus_controller_t controller = kern_nexus_shared_controller();
1076 int err;
1077
1078 if (dlil_detach_nexus_instance(controller, func_str, instance,
1079 device)) {
1080 detached = TRUE;
1081 }
1082 if (provider != NULL && !uuid_is_null(provider)) {
1083 detached = TRUE;
1084 err = kern_nexus_controller_deregister_provider(controller,
1085 provider);
1086 if (err != 0) {
1087 DLIL_PRINTF("%s deregister_provider %d\n",
1088 func_str, err);
1089 }
1090 }
1091 return detached;
1092 }
1093
1094 static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,nexus_type_t type,ifnet_t ifp,uuid_t * provider,uuid_t * instance,nexus_attr_t attr)1095 dlil_create_provider_and_instance(nexus_controller_t controller,
1096 nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
1097 nexus_attr_t attr)
1098 {
1099 uuid_t dom_prov;
1100 errno_t err;
1101 nexus_name_t provider_name;
1102 const char *type_name =
1103 (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
1104 struct kern_nexus_init init;
1105
1106 err = kern_nexus_get_default_domain_provider(type, &dom_prov);
1107 if (err != 0) {
1108 DLIL_PRINTF("%s can't get %s provider, error %d\n",
1109 __func__, type_name, err);
1110 goto failed;
1111 }
1112
1113 snprintf((char *)provider_name, sizeof(provider_name),
1114 "com.apple.%s.%s", type_name, if_name(ifp));
1115 err = kern_nexus_controller_register_provider(controller,
1116 dom_prov,
1117 provider_name,
1118 NULL,
1119 0,
1120 attr,
1121 provider);
1122 if (err != 0) {
1123 DLIL_PRINTF("%s register %s provider failed, error %d\n",
1124 __func__, type_name, err);
1125 goto failed;
1126 }
1127 bzero(&init, sizeof(init));
1128 init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
1129 err = kern_nexus_controller_alloc_provider_instance(controller,
1130 *provider,
1131 NULL, NULL,
1132 instance, &init);
1133 if (err != 0) {
1134 DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
1135 __func__, type_name, err);
1136 kern_nexus_controller_deregister_provider(controller,
1137 *provider);
1138 goto failed;
1139 }
1140 failed:
1141 return err;
1142 }
1143
/*
 * Create the netif nexus provider + instance for ifp and attach ifp to
 * it, filling in netif_nx on success.  Returns TRUE on success; on any
 * failure the partially-created provider/instance is torn down and
 * FALSE is returned.  A no-op (FALSE) if the interface already has a
 * Skywalk nexus (IFCAP_SKYWALK set).
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	/* attr is created before any path that can fail after it */
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1197
1198 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1199 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1200 {
1201 if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1202 IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
1203 goto failed;
1204 }
1205 switch (ifp->if_type) {
1206 case IFT_CELLULAR:
1207 case IFT_ETHER:
1208 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1209 /* don't auto-attach */
1210 goto failed;
1211 }
1212 break;
1213 default:
1214 /* don't auto-attach */
1215 goto failed;
1216 }
1217 return dlil_attach_netif_nexus_common(ifp, netif_nx);
1218
1219 failed:
1220 return FALSE;
1221 }
1222
1223 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1224 dlil_is_native_netif_nexus(ifnet_t ifp)
1225 {
1226 return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1227 }
1228
/* Tear down the netif nexus recorded in nexus_netif (best-effort). */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1236
1237 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1238 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1239 {
1240 struct ifreq ifr;
1241 int error;
1242
1243 bzero(&ifr, sizeof(ifr));
1244 error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1245 if (error == 0) {
1246 *ifdm_p = ifr.ifr_devmtu;
1247 }
1248 return error;
1249 }
1250
/*
 * macOS-only: for Skywalk-native drivers, grow *large_buf_size to cover
 * the driver's advertised TSO MTU (or the GSO MTU when TSO is absent),
 * capped at NX_FSW_MAX_LARGE_BUFSIZE.  No-op elsewhere.
 */
static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
{
#ifdef XNU_TARGET_OS_OSX
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;

	if (!dlil_is_native_netif_nexus(ifp)) {
		return;
	}
	/*
	 * Note that we are reading the real hwassist flags set by the driver
	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
	 * hasn't been called yet.
	 */
	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
		tso_v4_mtu = ifp->if_tso_v4_mtu;
	}
	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
		tso_v6_mtu = ifp->if_tso_v6_mtu;
	}
	/*
	 * If the hardware supports TSO, adjust the large buf size to match the
	 * supported TSO MTU size.
	 */
	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
	} else {
		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
	}
	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
#else
#pragma unused(ifp, large_buf_size)
#endif /* XNU_TARGET_OS_OSX */
}
1286
1287 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1288 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1289 bool *use_multi_buflet, uint32_t *large_buf_size)
1290 {
1291 struct kern_pbufpool_memory_info rx_pp_info;
1292 struct kern_pbufpool_memory_info tx_pp_info;
1293 uint32_t if_max_mtu = 0;
1294 uint32_t drv_buf_size;
1295 struct ifdevmtu ifdm;
1296 int err;
1297
1298 /*
1299 * To perform intra-stack RX aggregation flowswitch needs to use
1300 * multi-buflet packet.
1301 */
1302 *use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1303
1304 *large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1305 /*
1306 * IP over Thunderbolt interface can deliver the largest IP packet,
1307 * but the driver advertises the MAX MTU as only 9K.
1308 */
1309 if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1310 if_max_mtu = IP_MAXPACKET;
1311 goto skip_mtu_ioctl;
1312 }
1313
1314 /* determine max mtu */
1315 bzero(&ifdm, sizeof(ifdm));
1316 err = dlil_siocgifdevmtu(ifp, &ifdm);
1317 if (__improbable(err != 0)) {
1318 DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1319 __func__, if_name(ifp));
1320 /* use default flowswitch buffer size */
1321 if_max_mtu = NX_FSW_BUFSIZE;
1322 } else {
1323 DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1324 ifdm.ifdm_max, ifdm.ifdm_current);
1325 /* rdar://problem/44589731 */
1326 if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1327 }
1328
1329 skip_mtu_ioctl:
1330 if (if_max_mtu == 0) {
1331 DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1332 __func__, if_name(ifp));
1333 return EINVAL;
1334 }
1335 if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1336 DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1337 "max bufsize(%d)\n", __func__,
1338 if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1339 return EINVAL;
1340 }
1341
1342 /*
1343 * for skywalk native driver, consult the driver packet pool also.
1344 */
1345 if (dlil_is_native_netif_nexus(ifp)) {
1346 err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1347 &tx_pp_info);
1348 if (err != 0) {
1349 DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1350 __func__, if_name(ifp));
1351 return ENXIO;
1352 }
1353 drv_buf_size = tx_pp_info.kpm_bufsize *
1354 tx_pp_info.kpm_max_frags;
1355 if (if_max_mtu > drv_buf_size) {
1356 DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1357 "tx %d * %d) can't support max mtu(%d)\n", __func__,
1358 if_name(ifp), rx_pp_info.kpm_bufsize,
1359 rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1360 tx_pp_info.kpm_max_frags, if_max_mtu);
1361 return EINVAL;
1362 }
1363 } else {
1364 drv_buf_size = if_max_mtu;
1365 }
1366
1367 if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1368 _CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1369 *use_multi_buflet = true;
1370 /* default flowswitch buffer size */
1371 *buf_size = NX_FSW_BUFSIZE;
1372 *large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1373 } else {
1374 *buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1375 }
1376 _dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1377 ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1378 if (*buf_size >= *large_buf_size) {
1379 *large_buf_size = 0;
1380 }
1381 return 0;
1382 }
1383
/*
 * Create a flowswitch nexus for ifp and attach it to the interface's
 * netif instance, filling in nexus_fsw on success.  Returns TRUE on
 * success; FALSE when auto-attach is not applicable (no-auto, vmnet,
 * no netif plumbed, flag disabled) or on any error, in which case all
 * partially-created state is torn down and the reason is logged.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err = 0;
	uuid_t netif;
	uint32_t buf_size = 0;
	uint32_t large_buf_size = 0;
	bool multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	/* compute slot/large buffer sizes and multi-buflet requirement */
	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	/* err == 0 here means "not applicable" rather than a hard failure */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1482
/*
 * Attach a flowswitch nexus to ifp and publish it in if_nx_flowswitch
 * under the interface lock.  Declines for legacy-TX interfaces, for
 * interfaces that already have a flowswitch, and (dev/debug) for
 * netif-direct interfaces.  If the interface starts detaching between
 * the attach and the publish, the freshly-created nexus is torn down
 * again.  Returns TRUE only when the nexus was attached and published.
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t attached;
	if_nexus_flowswitch nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
		    if_name(ifp));
		return FALSE;
	}

	/* uuid_is_null() == 0 means the instance uuid is non-null */
	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}
1528
/* Tear down the flowswitch nexus recorded in nexus_fsw (best-effort). */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1536
1537 __attribute__((noinline))
1538 static void
dlil_netif_detach_notify(ifnet_t ifp)1539 dlil_netif_detach_notify(ifnet_t ifp)
1540 {
1541 ifnet_detach_notify_cb_t notify = NULL;
1542 void *arg = NULL;
1543
1544 ifnet_get_detach_notify(ifp, ¬ify, &arg);
1545 if (notify == NULL) {
1546 DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1547 return;
1548 }
1549 (*notify)(arg);
1550 }
1551
/*
 * Quiesce data movement on ifp, tear down its flowswitch and netif
 * nexuses (if attached), clear the recorded state, then resume data
 * movement.  The asserts check the invariant that provider/instance
 * UUIDs are set or cleared together with the device/attach UUID.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block new data movement and drain in-flight activity first */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1583
1584 boolean_t
ifnet_add_netagent(ifnet_t ifp)1585 ifnet_add_netagent(ifnet_t ifp)
1586 {
1587 int error;
1588
1589 error = kern_nexus_interface_add_netagent(ifp);
1590 os_log(OS_LOG_DEFAULT,
1591 "kern_nexus_interface_add_netagent(%s) returned %d",
1592 ifp->if_xname, error);
1593 return error == 0;
1594 }
1595
1596 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1597 ifnet_remove_netagent(ifnet_t ifp)
1598 {
1599 int error;
1600
1601 error = kern_nexus_interface_remove_netagent(ifp);
1602 os_log(OS_LOG_DEFAULT,
1603 "kern_nexus_interface_remove_netagent(%s) returned %d",
1604 ifp->if_xname, error);
1605 return error == 0;
1606 }
1607
1608 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1609 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1610 {
1611 if (!IF_FULLY_ATTACHED(ifp)) {
1612 return FALSE;
1613 }
1614 return dlil_attach_flowswitch_nexus(ifp);
1615 }
1616
1617 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1618 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1619 {
1620 if_nexus_flowswitch nexus_fsw;
1621
1622 ifnet_lock_exclusive(ifp);
1623 nexus_fsw = ifp->if_nx_flowswitch;
1624 bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1625 ifnet_lock_done(ifp);
1626 return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1627 nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1628 }
1629
1630 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1631 ifnet_attach_netif_nexus(ifnet_t ifp)
1632 {
1633 boolean_t nexus_attached;
1634 if_nexus_netif nexus_netif;
1635
1636 if (!IF_FULLY_ATTACHED(ifp)) {
1637 return FALSE;
1638 }
1639 nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1640 if (nexus_attached) {
1641 ifnet_lock_exclusive(ifp);
1642 ifp->if_nx_netif = nexus_netif;
1643 ifnet_lock_done(ifp);
1644 }
1645 return nexus_attached;
1646 }
1647
1648 boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)1649 ifnet_detach_netif_nexus(ifnet_t ifp)
1650 {
1651 if_nexus_netif nexus_netif;
1652
1653 ifnet_lock_exclusive(ifp);
1654 nexus_netif = ifp->if_nx_netif;
1655 bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
1656 ifnet_lock_done(ifp);
1657
1658 return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
1659 nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
1660 }
1661
1662 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1663 ifnet_attach_native_flowswitch(ifnet_t ifp)
1664 {
1665 if (!dlil_is_native_netif_nexus(ifp)) {
1666 /* not a native netif */
1667 return;
1668 }
1669 ifnet_attach_flowswitch_nexus(ifp);
1670 }
1671
/*
 * Install (or clear, with cb == NULL) the flowswitch RX callback for
 * ifp.  Blocks until all current users of the callback have released
 * their references (if_fsw_rx_cb_ref drains to 0) so the old callback
 * can never be invoked after this returns.  Always returns 0.
 */
int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* wait for in-flight users of the previous callback to drain */
	while (ifp->if_fsw_rx_cb_ref > 0) {
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1687
/*
 * Look up the flowswitch RX callback and take a reference on it; the
 * caller must pair this with ifnet_release_flowswitch_rx_callback().
 * Returns ENOENT when no callback is installed.
 */
int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * This is for avoiding the unnecessary lock acquire for interfaces
	 * not used by a redirect interface.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* recheck under the lock; the callback may have been cleared */
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1709
/*
 * Drop a reference taken by ifnet_get_flowswitch_rx_callback(); wakes
 * any setter waiting in ifnet_set_flowswitch_rx_callback() when the
 * last reference goes away.
 */
void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (--ifp->if_fsw_rx_cb_ref == 0) {
		wakeup(&ifp->if_fsw_rx_cb_ref);
	}
	lck_mtx_unlock(&ifp->if_delegate_lock);
}
1719
/*
 * Set (or clear, with parent == NULL) the delegate parent of difp,
 * blocking until all outstanding references on the previous parent are
 * released.  Always returns 0.
 */
int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	/* wait for holders of the previous parent to drain */
	while (difp->if_delegate_parent_ref > 0) {
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1734
/*
 * Look up difp's delegate parent and take a reference; pair with
 * ifnet_release_delegate_parent().  Returns ENOENT when none is set.
 */
int
ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (difp->if_delegate_parent == NULL) {
		lck_mtx_unlock(&difp->if_delegate_lock);
		return ENOENT;
	}
	*parentp = difp->if_delegate_parent;
	difp->if_delegate_parent_ref++;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1748
/*
 * Drop a reference taken by ifnet_get_delegate_parent(); wakes any
 * setter waiting in ifnet_set_delegate_parent() on the last release.
 */
void
ifnet_release_delegate_parent(ifnet_t difp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (--difp->if_delegate_parent_ref == 0) {
		wakeup(&difp->if_delegate_parent_ref);
	}
	lck_mtx_unlock(&difp->if_delegate_lock);
}
1758
/*
 * Set the detach-notify callback/argument; caller must hold the ifnet
 * lock exclusively (asserted).
 */
__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}
1767
/*
 * Fetch the detach-notification callback and its argument.
 * Caller must hold the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	*notifyp = ifp->if_detach_notify;
	*argp = ifp->if_detach_notify_arg;
}
1776
/*
 * Locking wrapper around ifnet_set_detach_notify_locked().
 */
__attribute__((noinline))
void
ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_exclusive(ifp);
	ifnet_set_detach_notify_locked(ifp, notify, arg);
	ifnet_lock_done(ifp);
}
1785
/*
 * Locking wrapper around ifnet_get_detach_notify_locked().
 */
__attribute__((noinline))
void
ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_exclusive(ifp);
	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
	ifnet_lock_done(ifp);
}
1794 #endif /* SKYWALK */
1795
/*
 * Sanity-check an inbound mbuf: it must carry a packet header and its
 * recorded receive interface must match `ifp' (loopback is exempt from
 * the rcvif match).  Any violation is a fatal kernel bug, hence panic.
 */
#define DLIL_INPUT_CHECK(m, ifp) { \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m); \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) || \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) { \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m); \
	        /* NOTREACHED */ \
	} \
}
1804
/*
 * Exponentially weighted moving average:
 *   old = ((old << decay) - old + new) >> decay
 * i.e. old is weighted by (2^decay - 1)/2^decay and the new sample by
 * 1/2^decay.  A zero average is simply seeded with the new sample.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
1813
#define MBPS (1ULL * 1000 * 1000)
#define GBPS (MBPS * 1000)

/*
 * Per-link-speed watermarks used to drive opportunistic input polling:
 * below the low watermarks the input path leaves polling mode, above
 * the high watermarks it enters it.
 */
struct rxpoll_time_tbl {
	u_int64_t speed;                /* downlink speed */
	u_int32_t plowat;               /* packets low watermark */
	u_int32_t phiwat;               /* packets high watermark */
	u_int32_t blowat;               /* bytes low watermark */
	u_int32_t bhiwat;               /* bytes high watermark */
};

/* table is scanned in ascending speed order; the all-zero row terminates it */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
1833
/*
 * Tracks DLIL worker threads that have been created but have not yet
 * finished starting up; dlil_thread_sync_lock protects the counter and
 * serves as the wait channel's lock.
 */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;
1837
/*
 * Account for a newly created, not-yet-started DLIL thread.
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1846
/*
 * A pending DLIL thread has finished starting; when the count drops to
 * zero, wake anyone waiting for all threads to come up.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1859
1860 int
proto_hash_value(u_int32_t protocol_family)1861 proto_hash_value(u_int32_t protocol_family)
1862 {
1863 /*
1864 * dlil_proto_unplumb_all() depends on the mapping between
1865 * the hash bucket index and the protocol family defined
1866 * here; future changes must be applied there as well.
1867 */
1868 switch (protocol_family) {
1869 case PF_INET:
1870 return 0;
1871 case PF_INET6:
1872 return 1;
1873 case PF_VLAN:
1874 return 2;
1875 case PF_UNSPEC:
1876 default:
1877 return 3;
1878 }
1879 }
1880
1881 /*
1882 * Caller must already be holding ifnet lock.
1883 */
1884 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1885 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1886 {
1887 struct if_proto *proto = NULL;
1888 u_int32_t i = proto_hash_value(protocol_family);
1889
1890 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1891
1892 if (ifp->if_proto_hash != NULL) {
1893 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1894 }
1895
1896 while (proto != NULL && proto->protocol_family != protocol_family) {
1897 proto = SLIST_NEXT(proto, next_hash);
1898 }
1899
1900 if (proto != NULL) {
1901 if_proto_ref(proto);
1902 }
1903
1904 return proto;
1905 }
1906
/*
 * Take a reference on an attached protocol; paired with if_proto_free().
 */
static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}
1912
1913 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1914
/*
 * Drop a reference on an attached protocol.  When the last reference
 * goes away: invoke the protocol's detached callback, purge its routes,
 * post KEV_DL_PROTO_DETACHED, mark the interface down if no protocols
 * remain, and free the if_proto.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		/* not the last reference; nothing else to do */
		return;
	}

	/* notify the protocol's owner, per the KPI version it registered */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1976
1977 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1978 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1979 {
1980 #if !MACH_ASSERT
1981 #pragma unused(ifp)
1982 #endif
1983 unsigned int type = 0;
1984 int ass = 1;
1985
1986 switch (what) {
1987 case IFNET_LCK_ASSERT_EXCLUSIVE:
1988 type = LCK_RW_ASSERT_EXCLUSIVE;
1989 break;
1990
1991 case IFNET_LCK_ASSERT_SHARED:
1992 type = LCK_RW_ASSERT_SHARED;
1993 break;
1994
1995 case IFNET_LCK_ASSERT_OWNED:
1996 type = LCK_RW_ASSERT_HELD;
1997 break;
1998
1999 case IFNET_LCK_ASSERT_NOTOWNED:
2000 /* nothing to do here for RW lock; bypass assert */
2001 ass = 0;
2002 break;
2003
2004 default:
2005 panic("bad ifnet assert type: %d", what);
2006 /* NOTREACHED */
2007 }
2008 if (ass) {
2009 LCK_RW_ASSERT(&ifp->if_lock, type);
2010 }
2011 }
2012
/* Take the per-ifnet RW lock for reading. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}
2018
/* Take the per-ifnet RW lock for writing. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}
2024
/* Release the per-ifnet RW lock (shared or exclusive). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
2030
2031 #if INET
/* Take the IPv4 per-ifnet data RW lock for reading. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}
2037
/* Take the IPv4 per-ifnet data RW lock for writing. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}
2043
/* Release the IPv4 per-ifnet data RW lock. */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
2049 #endif
2050
/* Take the IPv6 per-ifnet data RW lock for reading. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}
2056
/* Take the IPv6 per-ifnet data RW lock for writing. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}
2062
/* Release the IPv6 per-ifnet data RW lock. */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
2068
/* Take the global interface-list RW lock for reading. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}
2074
/* Take the global interface-list RW lock for writing. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}
2080
/* Release the global interface-list RW lock. */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}
2086
/* Assert that the caller holds the interface-list lock exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
2092
2093 /*
2094 * dlil_ifp_protolist
2095 * - get the list of protocols attached to the interface, or just the number
2096 * of attached protocols
2097 * - if the number returned is greater than 'list_count', truncation occurred
2098 *
2099 * Note:
2100 * - caller must already be holding ifnet lock.
2101 */
2102 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)2103 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
2104 u_int32_t list_count)
2105 {
2106 u_int32_t count = 0;
2107 int i;
2108
2109 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
2110
2111 if (ifp->if_proto_hash == NULL) {
2112 goto done;
2113 }
2114
2115 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
2116 struct if_proto *proto;
2117 SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
2118 if (list != NULL && count < list_count) {
2119 list[count] = proto->protocol_family;
2120 }
2121 count++;
2122 }
2123 }
2124 done:
2125 return count;
2126 }
2127
2128 __private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp,u_int32_t * protolist,u_int32_t count)2129 if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
2130 {
2131 ifnet_lock_shared(ifp);
2132 count = dlil_ifp_protolist(ifp, protolist, count);
2133 ifnet_lock_done(ifp);
2134 return count;
2135 }
2136
/* Free a protocol list previously allocated for if_get_protolist(). */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
2142
2143 __private_extern__ int
dlil_post_msg(struct ifnet * ifp,u_int32_t event_subclass,u_int32_t event_code,struct net_event_data * event_data,u_int32_t event_data_len,boolean_t suppress_generation)2144 dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
2145 u_int32_t event_code, struct net_event_data *event_data,
2146 u_int32_t event_data_len, boolean_t suppress_generation)
2147 {
2148 struct net_event_data ev_data;
2149 struct kev_msg ev_msg;
2150
2151 bzero(&ev_msg, sizeof(ev_msg));
2152 bzero(&ev_data, sizeof(ev_data));
2153 /*
2154 * a net event always starts with a net_event_data structure
2155 * but the caller can generate a simple net event or
2156 * provide a longer event structure to post
2157 */
2158 ev_msg.vendor_code = KEV_VENDOR_APPLE;
2159 ev_msg.kev_class = KEV_NETWORK_CLASS;
2160 ev_msg.kev_subclass = event_subclass;
2161 ev_msg.event_code = event_code;
2162
2163 if (event_data == NULL) {
2164 event_data = &ev_data;
2165 event_data_len = sizeof(struct net_event_data);
2166 }
2167
2168 strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
2169 event_data->if_family = ifp->if_family;
2170 event_data->if_unit = (u_int32_t)ifp->if_unit;
2171
2172 ev_msg.dv[0].data_length = event_data_len;
2173 ev_msg.dv[0].data_ptr = event_data;
2174 ev_msg.dv[1].data_length = 0;
2175
2176 bool update_generation = true;
2177 if (event_subclass == KEV_DL_SUBCLASS) {
2178 /* Don't update interface generation for frequent link quality and state changes */
2179 switch (event_code) {
2180 case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
2181 case KEV_DL_RRC_STATE_CHANGED:
2182 case KEV_DL_PRIMARY_ELECTED:
2183 update_generation = false;
2184 break;
2185 default:
2186 break;
2187 }
2188 }
2189
2190 /*
2191 * Some events that update generation counts might
2192 * want to suppress generation count.
2193 * One example is node presence/absence where we still
2194 * issue kernel event for the invocation but want to avoid
2195 * expensive operation of updating generation which triggers
2196 * NECP client updates.
2197 */
2198 if (suppress_generation) {
2199 update_generation = false;
2200 }
2201
2202 return dlil_event_internal(ifp, &ev_msg, update_generation);
2203 }
2204
/*
 * Allocate the per-interface protocol statistics blocks (tcpstat_local,
 * udpstat_local, and the IPv4/IPv6 ECN stats).  The tcp/udp stats are
 * carved out of oversized zone allocations so the stats themselves can
 * be 64-bit aligned; the original allocation address is stashed one
 * pointer-size below the aligned base so it can be recovered at free
 * time.
 *
 * Returns 0 on success, EINVAL on failure (with partial allocations
 * released).
 *
 * NOTE(review): if called when if_tcp_stat/if_udp_stat are already set,
 * `ret' stays EINVAL and the cleanup path frees the live stats blocks —
 * presumably this is only ever called once per ifp during attach; verify
 * against callers before relying on re-entry.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	if (ifp != NULL && ret != 0) {
		/* unwind: recover the original buffer address and free */
		if (ifp->if_tcp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2290
/*
 * Reset all opportunistic-polling state for `ifp' back to defaults:
 * cancel the poll cycle, force input model to poll-off, and clear all
 * accumulated statistics and mode/sample timestamps.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2309
/*
 * Initialize the threading state `inp' for an interface's input path and,
 * unless the synchronous strategy is selected, start the input thread.
 *
 * Strategy selection (in priority order):
 *   - ifp == NULL: the main (loopback/global) input thread, at dlil_init
 *   - legacy interface with RX polling: hybrid polling thread
 *   - net_async or legacy interface: asynchronous worker thread
 *   - otherwise: synchronous strategy, no dedicated thread (ENODEV)
 *
 * On return, *thfunc (if non-NULL) holds the chosen thread function, or
 * NULL for the synchronous strategy.  Returns 0 on success, ENODEV when
 * no thread is needed; panics if thread creation itself fails.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp;            /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	/* synchronous strategy: no dedicated thread to start */
	if (func == NULL) {
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		/* run the input thread at default (importance 0) precedence */
		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2451
#if TEST_INPUT_THREAD_TERMINATION
/*
 * Debug-only sysctl handler: read/write the busy-spin iteration count
 * used by dlil_terminate_input_thread() for testing thread teardown.
 * Writes are rejected with ENXIO when RX polling is disabled.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* error, or read-only access: nothing more to do */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
#endif /* TEST_INPUT_THREAD_TERMINATION */
2475
/*
 * Tear down and scrub a dlil_threading_info so it can be reused:
 * destroy its lock/group and zero all per-thread state.  The VERIFYs
 * insist that the input thread has fully quiesced (no queued packets,
 * no affinity, no driver/poller threads) before cleanup.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2501
/*
 * Self-termination path for a per-interface input thread (never the
 * main input thread).  Drains any queued packets, signals the waiter
 * that requested termination (DLIL_INPUT_TERMINATE), drops the thread
 * reference from kernel_thread_start(), and terminates.  Never returns.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* detach the pending packet chain while holding the lock */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2549
2550 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2551 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2552 {
2553 thread_affinity_policy_data_t policy;
2554
2555 bzero(&policy, sizeof(policy));
2556 policy.affinity_tag = tag;
2557 return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2558 (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2559 }
2560
2561 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
2562 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2563 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2564 enum net_filter_event_subsystems state)
2565 {
2566 bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
2567 if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
2568 if_enable_fsw_transport_netagent = 1;
2569 } else {
2570 if_enable_fsw_transport_netagent = 0;
2571 }
2572 if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
2573 kern_nexus_update_netagents();
2574 } else if (!if_enable_fsw_transport_netagent) {
2575 necp_update_all_clients();
2576 }
2577 }
2578 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2579
2580 void
dlil_init(void)2581 dlil_init(void)
2582 {
2583 thread_t thread = THREAD_NULL;
2584
2585 /*
2586 * The following fields must be 64-bit aligned for atomic operations.
2587 */
2588 IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2589 IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2590 IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2591 IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2592 IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2593 IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2594 IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2595 IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2596 IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2597 IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2598 IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2599 IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2600 IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2601 IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2602 IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2603
2604 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2605 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2606 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2607 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2608 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2609 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2610 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2611 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2612 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2613 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2614 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2615 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2616 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2617 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2618 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2619
2620 /*
2621 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
2622 */
2623 _CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
2624 _CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
2625 _CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
2626 _CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
2627 _CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
2628 _CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
2629 _CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
2630 _CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
2631 _CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
2632 _CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
2633 _CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
2634 _CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
2635 _CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
2636 _CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
2637
2638 /*
2639 * ... as well as the mbuf checksum flags counterparts.
2640 */
2641 _CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
2642 _CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
2643 _CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
2644 _CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
2645 _CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
2646 _CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
2647 _CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
2648 _CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
2649 _CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
2650 _CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
2651 _CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
2652
2653 /*
2654 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
2655 */
2656 _CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
2657 _CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
2658
2659 _CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
2660 _CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
2661 _CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
2662 _CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
2663
2664 _CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
2665 _CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
2666 _CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
2667
2668 _CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
2669 _CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
2670 _CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
2671 _CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
2672 _CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
2673 _CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
2674 _CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
2675 _CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
2676 _CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
2677 _CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
2678 _CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
2679 _CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
2680 _CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
2681 _CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
2682 _CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
2683 _CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
2684 _CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
2685 _CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);
2686
2687 _CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
2688 _CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
2689 _CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
2690 _CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
2691 _CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
2692 _CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
2693 _CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
2694 _CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
2695 _CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
2696 _CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
2697 _CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);
2698
2699 _CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
2700 _CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
2701
2702 PE_parse_boot_argn("net_affinity", &net_affinity,
2703 sizeof(net_affinity));
2704
2705 PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));
2706
2707 PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));
2708
2709 PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));
2710
2711 PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));
2712
2713 VERIFY(dlil_pending_thread_cnt == 0);
2714 #if SKYWALK
2715 boolean_t pe_enable_fsw_transport_netagent = FALSE;
2716 boolean_t pe_disable_fsw_transport_netagent = FALSE;
2717 boolean_t enable_fsw_netagent =
2718 (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
2719 (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
2720
2721 /*
2722 * Check the device tree to see if Skywalk netagent has been explicitly
2723 * enabled or disabled. This can be overridden via if_attach_nx below.
2724 * Note that the property is a 0-length key, and so checking for the
2725 * presence itself is enough (no need to check for the actual value of
2726 * the retrieved variable.)
2727 */
2728 pe_enable_fsw_transport_netagent =
2729 PE_get_default("kern.skywalk_netagent_enable",
2730 &pe_enable_fsw_transport_netagent,
2731 sizeof(pe_enable_fsw_transport_netagent));
2732 pe_disable_fsw_transport_netagent =
2733 PE_get_default("kern.skywalk_netagent_disable",
2734 &pe_disable_fsw_transport_netagent,
2735 sizeof(pe_disable_fsw_transport_netagent));
2736
2737 /*
2738 * These two are mutually exclusive, i.e. they both can be absent,
2739 * but only one can be present at a time, and so we assert to make
2740 * sure it is correct.
2741 */
2742 VERIFY((!pe_enable_fsw_transport_netagent &&
2743 !pe_disable_fsw_transport_netagent) ||
2744 (pe_enable_fsw_transport_netagent ^
2745 pe_disable_fsw_transport_netagent));
2746
2747 if (pe_enable_fsw_transport_netagent) {
2748 kprintf("SK: netagent is enabled via an override for "
2749 "this platform\n");
2750 if_attach_nx = SKYWALK_NETWORKING_ENABLED;
2751 } else if (pe_disable_fsw_transport_netagent) {
2752 kprintf("SK: netagent is disabled via an override for "
2753 "this platform\n");
2754 if_attach_nx = SKYWALK_NETWORKING_DISABLED;
2755 } else {
2756 kprintf("SK: netagent is %s by default for this platform\n",
2757 (enable_fsw_netagent ? "enabled" : "disabled"));
2758 if_attach_nx = IF_ATTACH_NX_DEFAULT;
2759 }
2760
2761 /*
2762 * Now see if there's a boot-arg override.
2763 */
2764 (void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
2765 sizeof(if_attach_nx));
2766 if_enable_fsw_transport_netagent =
2767 ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
2768
2769 if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);
2770
2771 if (pe_disable_fsw_transport_netagent &&
2772 if_enable_fsw_transport_netagent) {
2773 kprintf("SK: netagent is force-enabled\n");
2774 } else if (!pe_disable_fsw_transport_netagent &&
2775 !if_enable_fsw_transport_netagent) {
2776 kprintf("SK: netagent is force-disabled\n");
2777 }
2778 #ifdef XNU_TARGET_OS_OSX
2779 if (if_enable_fsw_transport_netagent) {
2780 net_filter_event_register(dlil_filter_event);
2781 }
2782 #endif /* XNU_TARGET_OS_OSX */
2783
2784 #if (DEVELOPMENT || DEBUG)
2785 (void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
2786 &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
2787 #endif /* (DEVELOPMENT || DEBUG) */
2788
2789 #endif /* SKYWALK */
2790 dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
2791 sizeof(struct dlil_ifnet_dbg);
2792 /* Enforce 64-bit alignment for dlil_ifnet structure */
2793 dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
2794 dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
2795 dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);
2796
2797 dlif_tcpstat_size = sizeof(struct tcpstat_local);
2798 /* Enforce 64-bit alignment for tcpstat_local structure */
2799 dlif_tcpstat_bufsize =
2800 dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
2801 dlif_tcpstat_bufsize = (uint32_t)
2802 P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
2803 dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
2804 dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);
2805
2806 dlif_udpstat_size = sizeof(struct udpstat_local);
2807 /* Enforce 64-bit alignment for udpstat_local structure */
2808 dlif_udpstat_bufsize =
2809 dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
2810 dlif_udpstat_bufsize = (uint32_t)
2811 P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
2812 dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
2813 dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);
2814
2815 eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);
2816
2817 TAILQ_INIT(&dlil_ifnet_head);
2818 TAILQ_INIT(&ifnet_head);
2819 TAILQ_INIT(&ifnet_detaching_head);
2820 TAILQ_INIT(&ifnet_ordered_head);
2821
2822 /* Initialize interface address subsystem */
2823 ifa_init();
2824
2825 #if PF
2826 /* Initialize the packet filter */
2827 pfinit();
2828 #endif /* PF */
2829
2830 /* Initialize queue algorithms */
2831 classq_init();
2832
2833 /* Initialize packet schedulers */
2834 pktsched_init();
2835
2836 /* Initialize flow advisory subsystem */
2837 flowadv_init();
2838
2839 /* Initialize the pktap virtual interface */
2840 pktap_init();
2841
2842 /* Initialize the service class to dscp map */
2843 net_qos_map_init();
2844
2845 /* Initialize the interface low power mode event handler */
2846 if_low_power_evhdlr_init();
2847
2848 /* Initialize the interface offload port list subsystem */
2849 if_ports_used_init();
2850
2851 #if DEBUG || DEVELOPMENT
2852 /* Run self-tests */
2853 dlil_verify_sum16();
2854 #endif /* DEBUG || DEVELOPMENT */
2855
2856 /*
2857 * Create and start up the main DLIL input thread and the interface
2858 * detacher threads once everything is initialized.
2859 */
2860 dlil_incr_pending_thread_count();
2861 (void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);
2862
2863 /*
2864 * Create ifnet detacher thread.
2865 * When an interface gets detached, part of the detach processing
2866 * is delayed. The interface is added to delayed detach list
2867 * and this thread is woken up to call ifnet_detach_final
2868 * on these interfaces.
2869 */
2870 dlil_incr_pending_thread_count();
2871 if (kernel_thread_start(ifnet_detacher_thread_func,
2872 NULL, &thread) != KERN_SUCCESS) {
2873 panic_plain("%s: couldn't create detacher thread", __func__);
2874 /* NOTREACHED */
2875 }
2876 thread_deallocate(thread);
2877
2878 /*
2879 * Wait for the created kernel threads for dlil to get
2880 * scheduled and run at least once before we proceed
2881 */
2882 lck_mtx_lock(&dlil_thread_sync_lock);
2883 while (dlil_pending_thread_cnt != 0) {
2884 DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
2885 "threads to get scheduled at least once.\n", __func__);
2886 (void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
2887 (PZERO - 1), __func__, NULL);
2888 LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
2889 }
2890 lck_mtx_unlock(&dlil_thread_sync_lock);
2891 DLIL_PRINTF("%s: All the created dlil kernel threads have been "
2892 "scheduled at least once. Proceeding.\n", __func__);
2893 }
2894
2895 static void
if_flt_monitor_busy(struct ifnet * ifp)2896 if_flt_monitor_busy(struct ifnet *ifp)
2897 {
2898 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2899
2900 ++ifp->if_flt_busy;
2901 VERIFY(ifp->if_flt_busy != 0);
2902 }
2903
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	/*
	 * Dropping the busy count is identical to leaving the monitor;
	 * delegate to if_flt_monitor_leave().  Caller must hold
	 * ifp->if_flt_lock (asserted there).
	 */
	if_flt_monitor_leave(ifp);
}
2909
static void
if_flt_monitor_enter(struct ifnet *ifp)
{
	/*
	 * Enter the filter-list monitor: wait until no other thread is
	 * busy with the filter list, then mark ourselves busy.  Caller
	 * must hold ifp->if_flt_lock; msleep() drops and reacquires the
	 * lock while blocked, so the busy condition is rechecked in a
	 * loop after each wakeup.
	 */
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	while (ifp->if_flt_busy) {
		++ifp->if_flt_waiters;
		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
		    (PZERO - 1), "if_flt_monitor", NULL);
	}
	if_flt_monitor_busy(ifp);
}
2922
static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	/*
	 * Leave the filter-list monitor: decrement the busy count and,
	 * if we were the last busy thread and someone is waiting in
	 * if_flt_monitor_enter(), wake all waiters.  Caller must hold
	 * ifp->if_flt_lock.
	 */
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_flt_busy != 0);
	--ifp->if_flt_busy;

	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		/* reset before wakeup; waiters re-register if they block again */
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
2936
/*
 * Attach an interface filter to ifp.
 *
 * Copies the callbacks from 'if_filter' into a newly zone-allocated
 * ifnet_filter, links it on the interface's filter list under the
 * filter monitor, and returns the new filter via 'filter_ref'.
 *
 * Returns 0 on success, or ENXIO if the interface is not in the global
 * list or is no longer attached.  On failure the allocated filter (if
 * any) is freed and *filter_ref is left untouched.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* Takes an I/O refcnt on success; released below before 'done' */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	/* Z_NOFAIL: allocation cannot fail, no error path needed */
	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	/* serialize list mutation against concurrent attach/detach */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	/* account the attach in the global network API statistics */
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* per-interface count of non-OS (third-party) filters */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the I/O refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
3027
/*
 * Detach an interface filter.
 *
 * When 'detached' is 0 the filter is looked up on every interface's
 * filter list, unlinked under the filter monitor, and destroyed;
 * EINVAL is returned if it is not found.  When 'detached' is non-zero
 * the caller (ifnet_detach_final) has already emptied the list, so
 * only the accounting and destruction are performed here.
 */
static int
dlil_detach_filter_internal(interface_filter_t filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		/* scan every attached interface for this filter */
		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1; /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				/* reacquire and enter the monitor (may sleep) */
				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	/* global accounting: mirror of the increments in dlil_attach_filter */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;	/* NULL so the error print below is skipped */
done:
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
3148
3149 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)3150 dlil_detach_filter(interface_filter_t filter)
3151 {
3152 if (filter == NULL) {
3153 return;
3154 }
3155 dlil_detach_filter_internal(filter, 0);
3156 }
3157
3158 __private_extern__ boolean_t
dlil_has_ip_filter(void)3159 dlil_has_ip_filter(void)
3160 {
3161 boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
3162
3163 VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
3164
3165 DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
3166 return has_filter;
3167 }
3168
3169 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)3170 dlil_has_if_filter(struct ifnet *ifp)
3171 {
3172 boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
3173 DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
3174 return has_filter;
3175 }
3176
static inline void
dlil_input_wakeup(struct dlil_threading_info *inp)
{
	/*
	 * Flag that input work is pending and, if the input thread is not
	 * currently running its work loop, wake it.  dlth_wtot accumulates
	 * wakeup requests (consumed by the rxpoll EWMA logic).  Caller
	 * must hold inp->dlth_lock.
	 */
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);

	inp->dlth_flags |= DLIL_INPUT_WAITING;
	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
		inp->dlth_wtot++;
		wakeup_one((caddr_t)&inp->dlth_flags);
	}
}
3188
/*
 * Bootstrap entry for the main DLIL input thread.  Marks the thread
 * embryonic, issues a self-wakeup so the continuation runs once to
 * clear that state (and to signal dlil_init that the thread has been
 * scheduled), then blocks with dlil_main_input_thread_cont as the
 * continuation.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3211
3212 /*
3213 * Main input thread:
3214 *
3215 * a) handles all inbound packets for lo0
3216 * b) handles all inbound packets for interfaces with no dedicated
3217 * input thread (e.g. anything but Ethernet/PDP or those that support
3218 * opportunistic polling.)
3219 * c) protocol registrations
3220 * d) packet injections
3221 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* same object viewed as both its main-thread and generic parts */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	/* drain work until no flag other than RUNNING remains set */
	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation clears embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* drop the lock while processing the dequeued chains */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* tell dlil_init this thread has run at least once */
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	/* no more work: clear RUNNING and block until the next wakeup */
	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);	/* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3308
3309 /*
3310 * Input thread for interfaces with legacy input model.
3311 */
/*
 * Bootstrap entry for a per-interface legacy input thread.  Names the
 * thread after the interface, marks it embryonic, self-wakes so the
 * continuation runs once, then blocks with dlil_input_thread_cont as
 * the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	/* legacy model: must not be an rxpoll-capable configuration */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3346
/*
 * Continuation body for a per-interface legacy input thread: drain the
 * per-thread packet queue, sync input stats, and block again.  Unlike
 * the main input thread, this one is interruptible and honors the
 * DLIL_INPUT_TERMINATE flag (set when the interface detaches).
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* interface going away or thread interrupted: tear down */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	/* drain work until only RUNNING/TERMINATE remain set */
	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation clears embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while processing the dequeued chain */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* tell ifnet_attach this thread has run at least once */
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* no more work: block until the next wakeup */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);	/* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3450
3451 /*
3452 * Input thread for interfaces with opportunistic polling input model.
3453 */
/*
 * Bootstrap entry for a per-interface opportunistic-polling (rxpoll)
 * input thread.  Names the thread after the interface, marks it
 * embryonic, self-wakes so the continuation runs once, then blocks
 * with dlil_rxpoll_input_thread_cont as the continuation.  Never
 * returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	/* rxpoll model requires both IFEF_RXPOLL and the legacy xflag */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3485
3486 __attribute__((noreturn))
3487 static void
dlil_rxpoll_input_thread_cont(void * v,wait_result_t wres)3488 dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
3489 {
3490 struct dlil_threading_info *inp = v;
3491 struct ifnet *ifp = inp->dlth_ifp;
3492 struct timespec ts;
3493
3494 lck_mtx_lock_spin(&inp->dlth_lock);
3495 if (__improbable(wres == THREAD_INTERRUPTED ||
3496 (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
3497 goto terminate;
3498 }
3499
3500 VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
3501 inp->dlth_flags |= DLIL_INPUT_RUNNING;
3502
3503 while (1) {
3504 struct mbuf *m = NULL;
3505 uint32_t m_cnt, poll_req = 0;
3506 uint64_t m_size = 0;
3507 ifnet_model_t mode;
3508 struct timespec now, delta;
3509 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3510 boolean_t notify;
3511 boolean_t embryonic;
3512 uint64_t ival;
3513
3514 inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3515
3516 if (__improbable(embryonic =
3517 (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3518 inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3519 goto skip;
3520 }
3521
3522 if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
3523 ival = IF_RXPOLL_INTERVALTIME_MIN;
3524 }
3525
3526 /* Link parameters changed? */
3527 if (ifp->if_poll_update != 0) {
3528 ifp->if_poll_update = 0;
3529 (void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
3530 }
3531
3532 /* Current operating mode */
3533 mode = ifp->if_poll_mode;
3534
3535 /*
3536 * Protocol registration and injection must always use
3537 * the main input thread; in theory the latter can utilize
3538 * the corresponding input thread where the packet arrived
3539 * on, but that requires our knowing the interface in advance
3540 * (and the benefits might not worth the trouble.)
3541 */
3542 VERIFY(!(inp->dlth_flags &
3543 (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
3544
3545 /* Total count of all packets */
3546 m_cnt = qlen(&inp->dlth_pkts);
3547
3548 /* Total bytes of all packets */
3549 m_size = qsize(&inp->dlth_pkts);
3550
3551 /* Packets for this interface */
3552 _getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3553 m = pkt.cp_mbuf;
3554 VERIFY(m != NULL || m_cnt == 0);
3555
3556 nanouptime(&now);
3557 if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
3558 *(&ifp->if_poll_sample_lasttime) = *(&now);
3559 }
3560
3561 net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
3562 if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
3563 u_int32_t ptot, btot;
3564
3565 /* Accumulate statistics for current sampling */
3566 PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);
3567
3568 if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
3569 goto skip;
3570 }
3571
3572 *(&ifp->if_poll_sample_lasttime) = *(&now);
3573
3574 /* Calculate min/max of inbound bytes */
3575 btot = (u_int32_t)ifp->if_poll_sstats.bytes;
3576 if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
3577 ifp->if_rxpoll_bmin = btot;
3578 }
3579 if (btot > ifp->if_rxpoll_bmax) {
3580 ifp->if_rxpoll_bmax = btot;
3581 }
3582
3583 /* Calculate EWMA of inbound bytes */
3584 DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);
3585
3586 /* Calculate min/max of inbound packets */
3587 ptot = (u_int32_t)ifp->if_poll_sstats.packets;
3588 if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
3589 ifp->if_rxpoll_pmin = ptot;
3590 }
3591 if (ptot > ifp->if_rxpoll_pmax) {
3592 ifp->if_rxpoll_pmax = ptot;
3593 }
3594
3595 /* Calculate EWMA of inbound packets */
3596 DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
3597
3598 /* Reset sampling statistics */
3599 PKTCNTR_CLEAR(&ifp->if_poll_sstats);
3600
3601 /* Calculate EWMA of wakeup requests */
3602 DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
3603 if_rxpoll_decay);
3604 inp->dlth_wtot = 0;
3605
3606 if (dlil_verbose) {
3607 if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
3608 *(&ifp->if_poll_dbg_lasttime) = *(&now);
3609 }
3610 net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
3611 if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
3612 *(&ifp->if_poll_dbg_lasttime) = *(&now);
3613 DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
3614 "limits [%d/%d], wreq avg %d "
3615 "limits [%d/%d], bytes avg %d "
3616 "limits [%d/%d]\n", if_name(ifp),
3617 (ifp->if_poll_mode ==
3618 IFNET_MODEL_INPUT_POLL_ON) ?
3619 "ON" : "OFF", ifp->if_rxpoll_pavg,
3620 ifp->if_rxpoll_pmax,
3621 ifp->if_rxpoll_plowat,
3622 ifp->if_rxpoll_phiwat,
3623 ifp->if_rxpoll_wavg,
3624 ifp->if_rxpoll_wlowat,
3625 ifp->if_rxpoll_whiwat,
3626 ifp->if_rxpoll_bavg,
3627 ifp->if_rxpoll_blowat,
3628 ifp->if_rxpoll_bhiwat);
3629 }
3630 }
3631
3632 /* Perform mode transition, if necessary */
3633 if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
3634 *(&ifp->if_poll_mode_lasttime) = *(&now);
3635 }
3636
3637 net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
3638 if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
3639 goto skip;
3640 }
3641
3642 if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
3643 ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
3644 ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
3645 mode = IFNET_MODEL_INPUT_POLL_OFF;
3646 } else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
3647 (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
3648 ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
3649 ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
3650 mode = IFNET_MODEL_INPUT_POLL_ON;
3651 }
3652
3653 if (mode != ifp->if_poll_mode) {
3654 ifp->if_poll_mode = mode;
3655 *(&ifp->if_poll_mode_lasttime) = *(&now);
3656 poll_req++;
3657 }
3658 }
3659 skip:
3660 notify = dlil_input_stats_sync(ifp, inp);
3661
3662 lck_mtx_unlock(&inp->dlth_lock);
3663
3664 if (__improbable(embryonic)) {
3665 ifnet_decr_pending_thread_count(ifp);
3666 }
3667
3668 if (__improbable(notify)) {
3669 ifnet_notify_data_threshold(ifp);
3670 }
3671
3672 /*
3673 * If there's a mode change and interface is still attached,
3674 * perform a downcall to the driver for the new mode. Also
3675 * hold an IO refcnt on the interface to prevent it from
3676 * being detached (will be release below.)
3677 */
3678 if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
3679 struct ifnet_model_params p = {
3680 .model = mode, .reserved = { 0 }
3681 };
3682 errno_t err;
3683
3684 if (dlil_verbose) {
3685 DLIL_PRINTF("%s: polling is now %s, "
3686 "pkts avg %d max %d limits [%d/%d], "
3687 "wreq avg %d limits [%d/%d], "
3688 "bytes avg %d limits [%d/%d]\n",
3689 if_name(ifp),
3690 (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3691 "ON" : "OFF", ifp->if_rxpoll_pavg,
3692 ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
3693 ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
3694 ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
3695 ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
3696 ifp->if_rxpoll_bhiwat);
3697 }
3698
3699 if ((err = ((*ifp->if_input_ctl)(ifp,
3700 IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
3701 DLIL_PRINTF("%s: error setting polling mode "
3702 "to %s (%d)\n", if_name(ifp),
3703 (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3704 "ON" : "OFF", err);
3705 }
3706
3707 switch (mode) {
3708 case IFNET_MODEL_INPUT_POLL_OFF:
3709 ifnet_set_poll_cycle(ifp, NULL);
3710 ifp->if_rxpoll_offreq++;
3711 if (err != 0) {
3712 ifp->if_rxpoll_offerr++;
3713 }
3714 break;
3715
3716 case IFNET_MODEL_INPUT_POLL_ON:
3717 net_nsectimer(&ival, &ts);
3718 ifnet_set_poll_cycle(ifp, &ts);
3719 ifnet_poll(ifp);
3720 ifp->if_rxpoll_onreq++;
3721 if (err != 0) {
3722 ifp->if_rxpoll_onerr++;
3723 }
3724 break;
3725
3726 default:
3727 VERIFY(0);
3728 /* NOTREACHED */
3729 }
3730
3731 /* Release the IO refcnt */
3732 ifnet_decr_iorefcnt(ifp);
3733 }
3734
3735 /*
3736 * NOTE warning %%% attention !!!!
3737 * We should think about putting some thread starvation
3738 * safeguards if we deal with long chains of packets.
3739 */
3740 if (__probable(m != NULL)) {
3741 dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
3742 }
3743
3744 lck_mtx_lock_spin(&inp->dlth_lock);
3745 VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3746 if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
3747 DLIL_INPUT_TERMINATE))) {
3748 break;
3749 }
3750 }
3751
3752 inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3753
3754 if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
3755 terminate:
3756 lck_mtx_unlock(&inp->dlth_lock);
3757 dlil_terminate_input_thread(inp);
3758 /* NOTREACHED */
3759 } else {
3760 (void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3761 lck_mtx_unlock(&inp->dlth_lock);
3762 (void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
3763 inp);
3764 /* NOTREACHED */
3765 }
3766
3767 VERIFY(0); /* we should never get here */
3768 /* NOTREACHED */
3769 __builtin_unreachable();
3770 }
3771
3772 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3773 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3774 {
3775 if (p != NULL) {
3776 if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3777 (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3778 return EINVAL;
3779 }
3780 if (p->packets_lowat != 0 && /* hiwat must be non-zero */
3781 p->packets_lowat >= p->packets_hiwat) {
3782 return EINVAL;
3783 }
3784 if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3785 (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3786 return EINVAL;
3787 }
3788 if (p->bytes_lowat != 0 && /* hiwat must be non-zero */
3789 p->bytes_lowat >= p->bytes_hiwat) {
3790 return EINVAL;
3791 }
3792 if (p->interval_time != 0 &&
3793 p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3794 p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3795 }
3796 }
3797 return 0;
3798 }
3799
/*
 * Recompute the interface's receive-polling tunables.
 *
 * If the link rate is unknown (0) and no explicit parameters were
 * supplied, polling is effectively disabled: low watermarks go to 0,
 * high watermarks open all the way up, and the sample holdtime is
 * cleared.  Otherwise the defaults are picked from rxpoll_tbl[] by
 * link speed, with any non-zero field in `p' (driver-supplied)
 * overriding the auto-tuned value.  Note that the global sysctls
 * if_rxpoll_max and if_rxpoll_interval_time, when set away from
 * their defaults, take precedence over driver-supplied plim/ival.
 *
 * Caller holds the input thread's dlth_lock (see
 * dlil_rxpoll_set_params).
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		/* unreachable hiwats: mode never transitions to POLL_ON */
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* find the highest table row whose speed <= inbw */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/* a non-zero if_rxpoll_max sysctl overrides the driver */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ? if_rxpoll_max : p->packets_limit);
		/* likewise a non-default if_rxpoll_interval_time sysctl */
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert nanosecond holdtimes into timespec form for the poller */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3869
3870 /*
3871 * Must be called on an attached ifnet (caller is expected to check.)
3872 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3873 */
/*
 * Validate `p' and apply it to the interface's polling state.
 *
 * Returns ENXIO when the interface is not RXPOLL-capable or has no
 * input thread, EINVAL when validation fails, 0 otherwise.
 *
 * `locked' indicates the caller already holds the input thread's
 * dlth_lock; otherwise it is taken (and dropped) here.
 */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate. If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values. Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3911
3912 /*
3913 * Must be called on an attached ifnet (caller is expected to check.)
3914 */
3915 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3916 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3917 {
3918 struct dlil_threading_info *inp;
3919
3920 VERIFY(ifp != NULL && p != NULL);
3921 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3922 return ENXIO;
3923 }
3924
3925 bzero(p, sizeof(*p));
3926
3927 lck_mtx_lock(&inp->dlth_lock);
3928 p->packets_limit = ifp->if_rxpoll_plim;
3929 p->packets_lowat = ifp->if_rxpoll_plowat;
3930 p->packets_hiwat = ifp->if_rxpoll_phiwat;
3931 p->bytes_lowat = ifp->if_rxpoll_blowat;
3932 p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3933 p->interval_time = ifp->if_rxpoll_ival;
3934 lck_mtx_unlock(&inp->dlth_lock);
3935
3936 return 0;
3937 }
3938
/*
 * Basic input KPI: hand a packet chain to DLIL.  The chain's tail,
 * packet count and byte total are computed internally (ext == FALSE),
 * so `s' may carry driver stats but is not trusted for chain shape.
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3945
/*
 * Extended input KPI: caller supplies the chain tail and the stat
 * increments (both mandatory, ext == TRUE); packet count is verified
 * against the chain when sanity checks are enabled.
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3952
/*
 * Input KPI for the RX poller (poll == TRUE): an empty chain is
 * legal here and the call is treated as extended only when packets
 * are actually present.
 */
errno_t
ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s,
	    (m_head != NULL), TRUE);
}
3960
/*
 * Common entry point behind ifnet_input{,_extended,_poll}().
 *
 * Validates the chain/stat combination, takes a datamov IO reference
 * on the interface (except lo0) so it cannot detach mid-call, computes
 * or verifies the chain's packet count and byte total, then hands the
 * chain to the interface's input function (if_input_dlil).
 *
 * ext  - caller supplied m_tail and s (extended variant).
 * poll - chain came from the RX poller; an empty chain is then legal.
 *
 * On invalid parameters or a detached interface the chain is freed
 * and EINVAL is returned; otherwise the input function's result.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* an empty chain is only valid for poll; ext requires stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* walk the chain to find the tail and count pkts/bytes */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* re-walk the chain to cross-check driver's tail */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	/*
	 * NOTE(review): when the caller supplied `s', the recomputed
	 * counts below land in the local copy `_s' while the original
	 * `s' is what is passed down -- so the adjustment only takes
	 * effect on the s == NULL path.  Presumably intentional (trust
	 * driver-provided stats for the extended variant) -- confirm.
	 */
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
4075
4076 #if SKYWALK
/*
 * Atomically swap the interface's input function from the default
 * dlil_input_handler to `fn'.  The compare-and-swap only succeeds if
 * the default is still installed; EBUSY means someone else already
 * replaced it.  ptrauth_nop_cast strips/retains pointer signing for
 * the raw CAS on arm64e.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, &dlil_input_handler),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4084
/*
 * Restore the default input handler, spinning until the CAS lands
 * (the current value is re-read on each iteration, so this tolerates
 * concurrent swaps).
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
		;
	}
}
4094
/*
 * Atomically swap the interface's output function from the default
 * dlil_output_handler to `fn'; EBUSY if the default is no longer
 * installed.  Mirrors dlil_set_input_handler.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, &dlil_output_handler),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4102
/*
 * Restore the default output handler, spinning until the CAS lands.
 * Mirrors dlil_reset_input_handler.
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
		;
	}
}
4112 #endif /* SKYWALK */
4113
/*
 * Default DLIL output handler: forward straight to the driver's
 * if_output routine.
 */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
4119
/*
 * Default DLIL input handler: dispatch the chain to the interface's
 * input thread (falling back to the main input thread when none is
 * assigned) via that thread's configured strategy.  On DEVELOPMENT/
 * DEBUG kernels a thread marked NET_THREAD_SYNC_RX forces the
 * synchronous path regardless of the strategy.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
4140
/*
 * Detect whether a queue contains a burst that needs to be trimmed.
 * True only for mbuf-backed queues (QP_MBUF) whose length exceeds the
 * larger of the per-queue limit and the if_rcvq_burst_limit sysctl.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q) \
    __improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) &&    \
    qtype(q) == QP_MBUF)

/* NOTE(review): user of this bound is not visible here -- presumably
 * sizes per-mbuf-class accounting; confirm against callers. */
#define MAX_KNOWN_MBUF_CLASS 8
4149
/*
 * Trim an overcommitted input queue down to if_rcvq_trim_pct percent
 * of its limit, dropping from the head (oldest packets first).
 *
 * Dropped mbufs are moved onto `freeq' rather than freed here: the
 * caller holds dlth_lock (hence "_locked") and must free them only
 * after dropping it.  `stat_delta' is adjusted in place so the ifnet
 * stats reflect the drops.
 *
 * Returns the number of packets dropped (0 when already within the
 * target).
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;    /* Length in packets. */
	uint64_t overcommitted_qsize;   /* Size in bytes. */
	uint32_t target_qlen;           /* The desired queue length after trimming. */
	uint32_t pkts_to_drop;          /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;      /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
4246
/*
 * Asynchronous input strategy: enqueue the chain on the input
 * thread's receive queue and wake it; protocol processing happens on
 * that thread, not on the caller's.
 *
 * Visible responsibilities:
 *  - one-time binding of the driver/poller thread to the input
 *    thread's affinity set (with a thread reference kept for detach);
 *  - enqueue onto the dedicated lo0 queue (main thread + lo_ifp) or
 *    the thread's regular queue;
 *  - trim the queue when over the burst limit, deferring the actual
 *    frees until dlth_lock has been dropped;
 *  - stats accounting using the trim-adjusted increments (s_adj).
 *
 * Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set. We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack. Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			/*
			 * NOTE(review): unlike the dlil_input_sync variant,
			 * this format string has no trailing "\n" -- confirm
			 * whether the mismatch is intentional for os_log.
			 */
			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context. All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4390
/*
 * Synchronous input strategy: enqueue the chain on the input thread's
 * queue, then immediately drain the whole queue and run protocol
 * processing inline on the calling thread (no wakeup of the input
 * thread).  Same trim-on-overcommit and deferred-free discipline as
 * dlil_input_async.  Never used for the main input thread.
 *
 * Always returns 0 (an empty chain is a no-op).
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/* cross-check the caller's stat increments against the chain */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* drain everything queued so far and process it below */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4500
4501 #if SKYWALK
/*
 * Atomically install `fn' as if_output, but only if the saved
 * original (if_save_output) is still the current handler; EBUSY
 * otherwise.  Mirrors dlil_set_output_handler for the driver-level
 * hook.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_save_output),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4509
/*
 * Restore if_output to the saved original handler, spinning until
 * the CAS lands.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
		;
	}
}
4519
/*
 * Atomically install `fn' as if_start, but only if the saved
 * original (if_save_start) is still current; EBUSY otherwise.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_save_start),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4527
/*
 * Restore if_start to the saved original handler, spinning until
 * the CAS lands.
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
		;
	}
}
4537 #endif /* SKYWALK */
4538
/*
 * Request a run of the interface's starter thread.
 *
 * resetfc      - clear IFSF_FLOW_CONTROLLED first (resume-output path);
 *                without it, a flow-controlled interface is left alone.
 * ignore_delay - set IFSF_NO_DELAY so delayed-start heuristics do not
 *                hold the wakeup back.
 *
 * No-op for interfaces without a starter thread (no IFEF_TXSTART).
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (ignore_delay) {
		ifp->if_start_flags |= IFSF_NO_DELAY;
	}
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	/* record the request even when no wakeup is issued below */
	ifp->if_start_req++;
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4571
/*
 * Record the pacemaker transmit time for the starter thread.
 * Plain store; no locking visible here -- presumably single-writer
 * or tolerant of torn reads on the consumer side (TODO confirm).
 */
void
ifnet_start_set_pacemaker_time(struct ifnet *ifp, uint64_t tx_time)
{
	ifp->if_start_pacemaker_time = tx_time;
}
4577
/*
 * Public KPI: kick the starter thread without touching flow-control
 * state and honoring the delayed-start heuristics.
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, FALSE);
}
4583
/*
 * Like ifnet_start(), but bypasses the delayed-start heuristics
 * (sets IFSF_NO_DELAY).
 */
void
ifnet_start_ignore_delay(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, TRUE);
}
4589
/*
 * Entry point of an interface's dedicated starter thread.  Names the
 * thread, optionally binds the lo0 starter into the main input
 * thread's affinity set, then parks in the embryonic state until
 * ifnet_start_thread_cont() is first kicked.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag. This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4655
/*
 * Continuation body of the starter thread.  Re-entered via
 * thread_block_parameter() after every wakeup/timeout: services driver
 * start requests in a loop, then either re-arms a timed or indefinite
 * wait on itself, or terminates when IFSF_TERMINATING is set (or the
 * wait was interrupted).  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/*
	 * First pass after thread creation: drop out of the embryonic
	 * state (briefly releasing the lock to decrement the pending
	 * thread count) and go straight to re-arming the wait.
	 */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		/* snapshot request counter to detect new arrivals below */
		u_int32_t req = ifp->if_start_req;
		/*
		 * Delayed-start heuristic: with ENQUEUE_MULTI and
		 * DELAY_START set and the queue still short, defer the
		 * driver callback once to allow packet coalescing.
		 */
		if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
		    !IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_flags &= ~IFSF_NO_DELAY;
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec pacemaker_ts;
		struct timespec *ts = NULL;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		if (ifp->if_start_pacemaker_time != 0) {
			struct timespec now_ts;
			uint64_t now;

			nanouptime(&now_ts);
			now = ((uint64_t)now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;

			/*
			 * Pacemaker deadline still in the future: sleep
			 * only until then and suppress the delayed-start
			 * heuristic for the next pass; otherwise the
			 * deadline was missed — clear it.
			 */
			if (ifp->if_start_pacemaker_time != 0 &&
			    ifp->if_start_pacemaker_time > now) {
				pacemaker_ts.tv_sec = 0;
				pacemaker_ts.tv_nsec = ifp->if_start_pacemaker_time - now;

				ts = &pacemaker_ts;
				ifp->if_start_flags |= IFSF_NO_DELAY;
				DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*, ifp,
				    uint64_t, pacemaker_ts.tv_nsec);
			} else {
				DTRACE_SKYWALK2(pacemaker__timer__miss, struct ifnet*, ifp,
				    uint64_t, now - ifp->if_start_pacemaker_time);
				ifp->if_start_pacemaker_time = 0;
				ifp->if_start_flags &= ~IFSF_NO_DELAY;
			}
		}

		/* fall back to the TBR restart cycle if applicable */
		if (ts == NULL) {
			ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
			    &ifp->if_start_cycle : NULL);
		}

		/* then to the delayed-start timeout if a delay is pending */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		/* a zero interval means "no timed wakeup" */
		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		ifp->if_start_pacemaker_time = 0;
		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4833
4834 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4835 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4836 {
4837 if (ts == NULL) {
4838 bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4839 } else {
4840 *(&ifp->if_start_cycle) = *ts;
4841 }
4842
4843 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4844 DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4845 if_name(ifp), ts->tv_nsec);
4846 }
4847 }
4848
4849 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4850 ifnet_poll_wakeup(struct ifnet *ifp)
4851 {
4852 LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4853
4854 ifp->if_poll_req++;
4855 if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4856 ifp->if_poll_thread != THREAD_NULL) {
4857 wakeup_one((caddr_t)&ifp->if_poll_thread);
4858 }
4859 }
4860
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 * Takes if_poll_lock around the wakeup as required by
	 * ifnet_poll_wakeup().
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4871
/*
 * Entry point of the dedicated per-interface poller thread (RXPOLL
 * capable interfaces only).  Names the thread and parks it in the
 * embryonic state; the real work happens in the continuation
 * ifnet_poll_thread_cont().  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	/*
	 * Park in the embryonic state: queue a wait on if_poll_thread,
	 * post one request so the continuation runs once to complete
	 * initialization, then block into ifnet_poll_thread_cont().
	 */
	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4900
/*
 * Continuation body of the poller thread.  Re-entered via
 * thread_block_parameter() after every wakeup/timeout: repeatedly calls
 * the driver's input poll routine and feeds the harvested mbufs into
 * the input path, then re-arms a timed or indefinite wait, or
 * terminates when IF_POLLF_TERMINATING is set (or the wait was
 * interrupted).  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/*
	 * First pass after thread creation: leave the embryonic state
	 * (briefly dropping the lock to decrement the pending thread
	 * count) and go straight to re-arming the wait.
	 */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		/* snapshot request counter to detect new arrivals below */
		u_int16_t req = ifp->if_poll_req;

		/* per-poll packet budget: explicit limit, or derived cap */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		    &m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll: still notify the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
5067
5068 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)5069 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
5070 {
5071 if (ts == NULL) {
5072 bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
5073 } else {
5074 *(&ifp->if_poll_cycle) = *ts;
5075 }
5076
5077 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
5078 DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
5079 if_name(ifp), ts->tv_nsec);
5080 }
5081 }
5082
5083 void
ifnet_purge(struct ifnet * ifp)5084 ifnet_purge(struct ifnet *ifp)
5085 {
5086 if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
5087 if_qflush_snd(ifp, false);
5088 }
5089 }
5090
5091 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)5092 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
5093 {
5094 IFCQ_LOCK_ASSERT_HELD(ifq);
5095
5096 if (!(IFCQ_IS_READY(ifq))) {
5097 return;
5098 }
5099
5100 if (IFCQ_TBR_IS_ENABLED(ifq)) {
5101 struct tb_profile tb = {
5102 .rate = ifq->ifcq_tbr.tbr_rate_raw,
5103 .percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
5104 };
5105 (void) ifclassq_tbr_set(ifq, &tb, FALSE);
5106 }
5107
5108 ifclassq_update(ifq, ev);
5109 }
5110
5111 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)5112 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
5113 {
5114 switch (ev) {
5115 case CLASSQ_EV_LINK_BANDWIDTH:
5116 if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
5117 ifp->if_poll_update++;
5118 }
5119 break;
5120
5121 default:
5122 break;
5123 }
5124 }
5125
5126 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)5127 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
5128 {
5129 struct ifclassq *ifq;
5130 u_int32_t omodel;
5131 errno_t err;
5132
5133 if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
5134 return EINVAL;
5135 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5136 return ENXIO;
5137 }
5138
5139 ifq = ifp->if_snd;
5140 IFCQ_LOCK(ifq);
5141 omodel = ifp->if_output_sched_model;
5142 ifp->if_output_sched_model = model;
5143 if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
5144 ifp->if_output_sched_model = omodel;
5145 }
5146 IFCQ_UNLOCK(ifq);
5147
5148 return err;
5149 }
5150
5151 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5152 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5153 {
5154 if (ifp == NULL) {
5155 return EINVAL;
5156 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5157 return ENXIO;
5158 }
5159
5160 ifclassq_set_maxlen(ifp->if_snd, maxqlen);
5161
5162 return 0;
5163 }
5164
5165 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5166 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5167 {
5168 if (ifp == NULL || maxqlen == NULL) {
5169 return EINVAL;
5170 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5171 return ENXIO;
5172 }
5173
5174 *maxqlen = ifclassq_get_maxlen(ifp->if_snd);
5175
5176 return 0;
5177 }
5178
5179 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)5180 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
5181 {
5182 errno_t err;
5183
5184 if (ifp == NULL || pkts == NULL) {
5185 err = EINVAL;
5186 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5187 err = ENXIO;
5188 } else {
5189 err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
5190 IF_CLASSQ_ALL_GRPS, pkts, NULL);
5191 }
5192
5193 return err;
5194 }
5195
5196 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)5197 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
5198 u_int32_t *pkts, u_int32_t *bytes)
5199 {
5200 errno_t err;
5201
5202 if (ifp == NULL || !MBUF_VALID_SC(sc) ||
5203 (pkts == NULL && bytes == NULL)) {
5204 err = EINVAL;
5205 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5206 err = ENXIO;
5207 } else {
5208 err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
5209 pkts, bytes);
5210 }
5211
5212 return err;
5213 }
5214
5215 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5216 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5217 {
5218 struct dlil_threading_info *inp;
5219
5220 if (ifp == NULL) {
5221 return EINVAL;
5222 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5223 return ENXIO;
5224 }
5225
5226 if (maxqlen == 0) {
5227 maxqlen = if_rcvq_maxlen;
5228 } else if (maxqlen < IF_RCVQ_MINLEN) {
5229 maxqlen = IF_RCVQ_MINLEN;
5230 }
5231
5232 inp = ifp->if_inp;
5233 lck_mtx_lock(&inp->dlth_lock);
5234 qlimit(&inp->dlth_pkts) = maxqlen;
5235 lck_mtx_unlock(&inp->dlth_lock);
5236
5237 return 0;
5238 }
5239
5240 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5241 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5242 {
5243 struct dlil_threading_info *inp;
5244
5245 if (ifp == NULL || maxqlen == NULL) {
5246 return EINVAL;
5247 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5248 return ENXIO;
5249 }
5250
5251 inp = ifp->if_inp;
5252 lck_mtx_lock(&inp->dlth_lock);
5253 *maxqlen = qlimit(&inp->dlth_pkts);
5254 lck_mtx_unlock(&inp->dlth_lock);
5255 return 0;
5256 }
5257
5258 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)5259 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
5260 uint16_t delay_timeout)
5261 {
5262 if (delay_qlen > 0 && delay_timeout > 0) {
5263 if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
5264 ifp->if_start_delay_qlen = MIN(100, delay_qlen);
5265 ifp->if_start_delay_timeout = min(20000, delay_timeout);
5266 /* convert timeout to nanoseconds */
5267 ifp->if_start_delay_timeout *= 1000;
5268 kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
5269 ifp->if_xname, (uint32_t)delay_qlen,
5270 (uint32_t)delay_timeout);
5271 } else {
5272 if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
5273 }
5274 }
5275
/*
 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
 * buf holds the full header.
 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned bounce buffer, used only when buf is misaligned */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		/*
		 * If the header is not sufficiently aligned for direct
		 * struct access, operate on an aligned local copy and
		 * copy the result back at the end.
		 */
		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* fast path: no DSCP bits set, nothing to do */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		/* keep the ECN bits, clear the DSCP bits */
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * Incrementally adjust the IPv4 header checksum for the
		 * TOS change: add the old value, subtract the new one,
		 * then fold the carry back into the low 16 bits.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* fast path: no DSCP bits set, nothing to do */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* no checksum adjustment needed for v6 (no header checksum) */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
5336
/*
 * Core single-packet enqueue path into the interface's classq.
 * Timestamps the packet if needed, updates foreground/realtime activity
 * markers, applies the multicast-DSCP workaround for Wi-Fi infra, runs
 * the delayed-start heuristic for ENQUEUE_MULTI interfaces, enqueues,
 * and finally kicks the starter thread when appropriate.  The caller
 * relinquishes ownership of the packet in all cases.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;	/* non-NULL => run DSCP-clear below */
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here. Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* make sure at least the Ethernet header is contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* neither v4 nor v6: skip the workaround */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* m_pullup may move data; refresh pointer */
				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface. If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* too short for an Ethernet header: skip workaround */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* neither v4 nor v6: skip the workaround */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* apply the Wi-Fi multicast DSCP workaround, if flagged above */
	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still within the sampling window: count it */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and disable delay */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: decide based on the count */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open a new sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5647
5648 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5649 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5650 classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5651 boolean_t flush, boolean_t *pdrop)
5652 {
5653 int error;
5654
5655 /* enqueue the packet (caller consumes object) */
5656 error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5657 cnt, bytes, pdrop);
5658
5659 /*
5660 * Tell the driver to start dequeueing; do this even when the queue
5661 * for the packet is suspended (EQSUSPENDED), as the driver could still
5662 * be dequeueing from other unsuspended queues.
5663 */
5664 if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5665 ifnet_start(ifp);
5666 }
5667 return error;
5668 }
5669
5670 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5671 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5672 {
5673 struct ifnet *ifp = handle;
5674 boolean_t pdrop; /* dummy */
5675 uint32_t i;
5676
5677 ASSERT(n_pkts >= 1);
5678 for (i = 0; i < n_pkts - 1; i++) {
5679 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5680 FALSE, &pdrop);
5681 }
5682 /* flush with the last packet */
5683 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5684 TRUE, &pdrop);
5685
5686 return 0;
5687 }
5688
5689 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5690 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5691 classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5692 {
5693 if (ifp->if_output_netem != NULL) {
5694 bool drop;
5695 errno_t error;
5696 error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5697 *pdrop = drop ? TRUE : FALSE;
5698 return error;
5699 } else {
5700 return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5701 }
5702 }
5703
5704 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5705 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5706 {
5707 boolean_t pdrop;
5708 return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5709 }
5710
5711 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5712 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5713 boolean_t *pdrop)
5714 {
5715 classq_pkt_t pkt;
5716
5717 if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5718 m->m_nextpkt != NULL) {
5719 if (m != NULL) {
5720 m_freem_list(m);
5721 *pdrop = TRUE;
5722 }
5723 return EINVAL;
5724 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5725 !IF_FULLY_ATTACHED(ifp)) {
5726 /* flag tested without lock for performance */
5727 m_freem(m);
5728 *pdrop = TRUE;
5729 return ENXIO;
5730 } else if (!(ifp->if_flags & IFF_UP)) {
5731 m_freem(m);
5732 *pdrop = TRUE;
5733 return ENETDOWN;
5734 }
5735
5736 CLASSQ_PKT_INIT_MBUF(&pkt, m);
5737 return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5738 }
5739
5740 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5741 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5742 struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5743 boolean_t *pdrop)
5744 {
5745 classq_pkt_t head, tail;
5746
5747 ASSERT(m_head != NULL);
5748 ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5749 ASSERT(m_tail != NULL);
5750 ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5751 ASSERT(ifp != NULL);
5752 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5753
5754 if (!IF_FULLY_ATTACHED(ifp)) {
5755 /* flag tested without lock for performance */
5756 m_freem_list(m_head);
5757 *pdrop = TRUE;
5758 return ENXIO;
5759 } else if (!(ifp->if_flags & IFF_UP)) {
5760 m_freem_list(m_head);
5761 *pdrop = TRUE;
5762 return ENETDOWN;
5763 }
5764
5765 CLASSQ_PKT_INIT_MBUF(&head, m_head);
5766 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5767 return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5768 flush, pdrop);
5769 }
5770
5771 #if SKYWALK
5772 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5773 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5774 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5775 {
5776 classq_pkt_t pkt;
5777
5778 ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5779
5780 if (__improbable(ifp == NULL || kpkt == NULL)) {
5781 if (kpkt != NULL) {
5782 pp_free_packet(__DECONST(struct kern_pbufpool *,
5783 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5784 *pdrop = TRUE;
5785 }
5786 return EINVAL;
5787 } else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5788 !IF_FULLY_ATTACHED(ifp))) {
5789 /* flag tested without lock for performance */
5790 pp_free_packet(__DECONST(struct kern_pbufpool *,
5791 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5792 *pdrop = TRUE;
5793 return ENXIO;
5794 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5795 pp_free_packet(__DECONST(struct kern_pbufpool *,
5796 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5797 *pdrop = TRUE;
5798 return ENETDOWN;
5799 }
5800
5801 CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5802 return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5803 }
5804
/* Enqueue one Skywalk packet on ifp's default classq (if_snd). */
errno_t
ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
    boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
}
5811
/* Enqueue one Skywalk packet on an explicitly supplied classq. */
errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
}
5818
5819 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5820 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5821 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5822 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5823 {
5824 classq_pkt_t head, tail;
5825
5826 ASSERT(k_head != NULL);
5827 ASSERT(k_tail != NULL);
5828 ASSERT(ifp != NULL);
5829 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5830
5831 if (!IF_FULLY_ATTACHED(ifp)) {
5832 /* flag tested without lock for performance */
5833 pp_free_packet_chain(k_head, NULL);
5834 *pdrop = TRUE;
5835 return ENXIO;
5836 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5837 pp_free_packet_chain(k_head, NULL);
5838 *pdrop = TRUE;
5839 return ENETDOWN;
5840 }
5841
5842 CLASSQ_PKT_INIT_PACKET(&head, k_head);
5843 CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5844 return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5845 flush, pdrop);
5846 }
5847
/* Enqueue a Skywalk packet chain on ifp's default classq (if_snd). */
errno_t
ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
    struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
    boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
	    cnt, bytes, flush, pdrop);
}
5856
/* Enqueue a Skywalk packet chain on an explicitly supplied classq. */
errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
    struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
    uint32_t bytes, boolean_t flush, boolean_t *pdrop)
{
	return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
	    cnt, bytes, flush, pdrop);
}
5865 #endif /* SKYWALK */
5866
5867 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5868 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5869 {
5870 errno_t rc;
5871 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5872
5873 if (ifp == NULL || mp == NULL) {
5874 return EINVAL;
5875 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5876 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5877 return ENXIO;
5878 }
5879 if (!ifnet_is_attached(ifp, 1)) {
5880 return ENXIO;
5881 }
5882
5883 #if SKYWALK
5884 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5885 #endif /* SKYWALK */
5886 rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5887 &pkt, NULL, NULL, NULL, 0);
5888 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5889 ifnet_decr_iorefcnt(ifp);
5890 *mp = pkt.cp_mbuf;
5891 return rc;
5892 }
5893
5894 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5895 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5896 struct mbuf **mp)
5897 {
5898 errno_t rc;
5899 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5900
5901 if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5902 return EINVAL;
5903 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5904 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5905 return ENXIO;
5906 }
5907 if (!ifnet_is_attached(ifp, 1)) {
5908 return ENXIO;
5909 }
5910
5911 #if SKYWALK
5912 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5913 #endif /* SKYWALK */
5914 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5915 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5916 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5917 ifnet_decr_iorefcnt(ifp);
5918 *mp = pkt.cp_mbuf;
5919 return rc;
5920 }
5921
5922 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5923 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5924 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5925 {
5926 errno_t rc;
5927 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5928 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5929
5930 if (ifp == NULL || head == NULL || pkt_limit < 1) {
5931 return EINVAL;
5932 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5933 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5934 return ENXIO;
5935 }
5936 if (!ifnet_is_attached(ifp, 1)) {
5937 return ENXIO;
5938 }
5939
5940 #if SKYWALK
5941 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5942 #endif /* SKYWALK */
5943 rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5944 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5945 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5946 ifnet_decr_iorefcnt(ifp);
5947 *head = pkt_head.cp_mbuf;
5948 if (tail != NULL) {
5949 *tail = pkt_tail.cp_mbuf;
5950 }
5951 return rc;
5952 }
5953
5954 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5955 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5956 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5957 {
5958 errno_t rc;
5959 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5960 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5961
5962 if (ifp == NULL || head == NULL || byte_limit < 1) {
5963 return EINVAL;
5964 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5965 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5966 return ENXIO;
5967 }
5968 if (!ifnet_is_attached(ifp, 1)) {
5969 return ENXIO;
5970 }
5971
5972 #if SKYWALK
5973 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5974 #endif /* SKYWALK */
5975 rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5976 byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5977 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5978 ifnet_decr_iorefcnt(ifp);
5979 *head = pkt_head.cp_mbuf;
5980 if (tail != NULL) {
5981 *tail = pkt_tail.cp_mbuf;
5982 }
5983 return rc;
5984 }
5985
5986 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5987 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5988 u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5989 u_int32_t *len)
5990 {
5991 errno_t rc;
5992 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5993 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5994
5995 if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5996 !MBUF_VALID_SC(sc)) {
5997 return EINVAL;
5998 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5999 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
6000 return ENXIO;
6001 }
6002 if (!ifnet_is_attached(ifp, 1)) {
6003 return ENXIO;
6004 }
6005
6006 #if SKYWALK
6007 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
6008 #endif /* SKYWALK */
6009 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
6010 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
6011 cnt, len, 0);
6012 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
6013 ifnet_decr_iorefcnt(ifp);
6014 *head = pkt_head.cp_mbuf;
6015 if (tail != NULL) {
6016 *tail = pkt_tail.cp_mbuf;
6017 }
6018 return rc;
6019 }
6020
6021 #if XNU_TARGET_OS_OSX
6022 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)6023 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
6024 const struct sockaddr *dest, const char *dest_linkaddr,
6025 const char *frame_type, u_int32_t *pre, u_int32_t *post)
6026 {
6027 if (pre != NULL) {
6028 *pre = 0;
6029 }
6030 if (post != NULL) {
6031 *post = 0;
6032 }
6033
6034 return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
6035 }
6036 #endif /* XNU_TARGET_OS_OSX */
6037
6038 static boolean_t
packet_has_vlan_tag(struct mbuf * m)6039 packet_has_vlan_tag(struct mbuf * m)
6040 {
6041 u_int tag = 0;
6042
6043 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
6044 tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
6045 if (tag == 0) {
6046 /* the packet is just priority-tagged, clear the bit */
6047 m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
6048 }
6049 }
6050 return tag != 0;
6051 }
6052
/*
 * Run every attached interface filter's input hook over *m_p.
 *
 * Returns 0 when the packet survived all filters, otherwise the first
 * non-zero filter result (by convention EJUSTRETURN means the filter
 * consumed the packet -- confirm against callers).  A filter may replace
 * the mbuf and/or frame header in place via m_p / frame_header_p.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet = FALSE;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	/*
	 * NOTE(review): packet_has_vlan_tag() clears the priority-tag bit
	 * as a side effect, so it runs even when the filter list below
	 * turns out to be empty.
	 */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the filter callout */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
6113
/*
 * Run every attached interface filter's output hook over *m_p.
 *
 * Returns 0 when the packet survived all filters, otherwise the first
 * non-zero filter result (by convention EJUSTRETURN means the filter
 * consumed the packet -- confirm against callers).  A filter may replace
 * the mbuf in place via m_p.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}
	/* side effect: may clear a priority-only tag's valid bit */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the filter callout */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
6166
6167 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)6168 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
6169 {
6170 int error;
6171
6172 if (ifproto->proto_kpi == kProtoKPI_v1) {
6173 /* Version 1 protocols get one packet at a time */
6174 while (m != NULL) {
6175 char * frame_header;
6176 mbuf_t next_packet;
6177
6178 next_packet = m->m_nextpkt;
6179 m->m_nextpkt = NULL;
6180 frame_header = m->m_pkthdr.pkt_hdr;
6181 m->m_pkthdr.pkt_hdr = NULL;
6182 error = (*ifproto->kpi.v1.input)(ifproto->ifp,
6183 ifproto->protocol_family, m, frame_header);
6184 if (error != 0 && error != EJUSTRETURN) {
6185 m_freem(m);
6186 }
6187 m = next_packet;
6188 }
6189 } else if (ifproto->proto_kpi == kProtoKPI_v2) {
6190 /* Version 2 protocols support packet lists */
6191 error = (*ifproto->kpi.v2.input)(ifproto->ifp,
6192 ifproto->protocol_family, m);
6193 if (error != 0 && error != EJUSTRETURN) {
6194 m_freem_list(m);
6195 }
6196 }
6197 }
6198
6199 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)6200 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
6201 struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
6202 {
6203 struct ifnet_stat_increment_param *d = &inp->dlth_stats;
6204
6205 if (s->packets_in != 0) {
6206 d->packets_in += s->packets_in;
6207 }
6208 if (s->bytes_in != 0) {
6209 d->bytes_in += s->bytes_in;
6210 }
6211 if (s->errors_in != 0) {
6212 d->errors_in += s->errors_in;
6213 }
6214
6215 if (s->packets_out != 0) {
6216 d->packets_out += s->packets_out;
6217 }
6218 if (s->bytes_out != 0) {
6219 d->bytes_out += s->bytes_out;
6220 }
6221 if (s->errors_out != 0) {
6222 d->errors_out += s->errors_out;
6223 }
6224
6225 if (s->collisions != 0) {
6226 d->collisions += s->collisions;
6227 }
6228 if (s->dropped != 0) {
6229 d->dropped += s->dropped;
6230 }
6231
6232 if (poll) {
6233 PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
6234 }
6235 }
6236
/*
 * Flush the DLIL input thread's locally accumulated counters into the
 * ifnet's global statistics and zero the local copies.
 *
 * Returns TRUE when ifp->if_data_threshold is non-zero; presumably the
 * caller uses this to decide whether to check/post a data-usage
 * notification -- confirm against callers.
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		/* input-queue drops are credited to ifi_iqdrops */
		os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
6296
6297 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6298 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6299 {
6300 return dlil_input_packet_list_common(ifp, m, 0,
6301 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6302 }
6303
6304 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6305 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6306 u_int32_t cnt, ifnet_model_t mode)
6307 {
6308 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6309 }
6310
/*
 * Core DLIL inbound path: walk a chain of received mbufs, demux each
 * packet to its protocol family, run interface filters and CLAT46
 * translation, and batch consecutive packets for the same protocol into
 * a single hand-off to dlil_ifproto_input().
 *
 * ifp_param may be NULL, in which case each packet's rcvif is used.  A
 * datamov (I/O) reference is held across packets arriving on the same
 * non-loopback interface and released when the interface changes or the
 * chain ends.  In extended polling mode, the legacy RX poller is kicked
 * every if_rxpoll_interval_pkts packets.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;        /* head of current per-proto batch */
	mbuf_t *pkt_next = NULL;        /* tail pointer of that batch */
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;               /* 1 while a datamov ref is held */

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* arm periodic poller kicks only for extended poll-on batches */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		/* no fixed interface supplied: trust the packet's rcvif */
		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach this packet from the chain before processing it */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		/* the wake-packet flag survives reclassification */
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			/* EJUSTRETURN: demux consumed the packet */
			if (error == EJUSTRETURN) {
				goto next;
			}
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			/* 6-to-4 translation may replace the mbuf */
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceeding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has been
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				/* header pointer implausible: invalidate */
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			os_atomic_inc(&ifp->if_imcasts, relaxed);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same proto as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6643
6644 static errno_t
if_mcasts_update_common(struct ifnet * ifp,bool sync)6645 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6646 {
6647 errno_t err;
6648
6649 if (sync) {
6650 err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6651 if (err == EAFNOSUPPORT) {
6652 err = 0;
6653 }
6654 } else {
6655 ifnet_ioctl_async(ifp, SIOCADDMULTI);
6656 err = 0;
6657 }
6658 DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6659 "(err=%d)\n", if_name(ifp),
6660 (err == 0 ? "successfully restored" : "failed to restore"),
6661 ifp->if_updatemcasts, err);
6662
6663 /* just return success */
6664 return 0;
6665 }
6666
6667 static errno_t
if_mcasts_update_async(struct ifnet * ifp)6668 if_mcasts_update_async(struct ifnet *ifp)
6669 {
6670 return if_mcasts_update_common(ifp, false);
6671 }
6672
6673 errno_t
if_mcasts_update(struct ifnet * ifp)6674 if_mcasts_update(struct ifnet *ifp)
6675 {
6676 return if_mcasts_update_common(ifp, true);
6677 }
6678
/*
 * Post a kernel event message, first bumping the interface generation
 * count when an ifp is supplied and notifying NECP clients of the change.
 *
 * @param ifp    interface the event refers to, or NULL for a global event.
 * @param event  fully-populated kev_msg to post.
 * @return       result of kev_post_msg().
 */
int
dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
{
    if (ifp) {
        ifnet_increment_generation(ifp);
    }

#if NECP
    /* Let NECP observers re-evaluate against the new interface state. */
    necp_update_all_clients();
#endif /* NECP */

    return kev_post_msg(event);
}
6693
6694 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6695 dlil_post_sifflags_msg(struct ifnet * ifp)
6696 {
6697 struct kev_msg ev_msg;
6698 struct net_event_data ev_data;
6699
6700 bzero(&ev_data, sizeof(ev_data));
6701 bzero(&ev_msg, sizeof(ev_msg));
6702 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6703 ev_msg.kev_class = KEV_NETWORK_CLASS;
6704 ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6705 ev_msg.event_code = KEV_DL_SIFFLAGS;
6706 strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6707 ev_data.if_family = ifp->if_family;
6708 ev_data.if_unit = (u_int32_t) ifp->if_unit;
6709 ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6710 ev_msg.dv[0].data_ptr = &ev_data;
6711 ev_msg.dv[1].data_length = 0;
6712 dlil_post_complete_msg(ifp, &ev_msg);
6713 }
6714
/* Stack-resident snapshot size; larger protocol lists fall back to kalloc. */
#define TMP_IF_PROTO_ARR_SIZE 10
/*
 * Deliver a kernel event to everything interested in this interface:
 * first the attached interface filters, then every attached protocol's
 * event handler, then the interface's own if_event callback, and finally
 * post the event message itself (bumping the generation count when
 * update_generation is true).
 *
 * Returns the result of dlil_post_complete_msg()/kev_post_msg().
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
    struct ifnet_filter *filter = NULL;
    struct if_proto *proto = NULL;
    int if_proto_count = 0;
    struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
    struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
    int tmp_ifproto_arr_idx = 0;

    /*
     * Pass the event to the interface filters
     */
    lck_mtx_lock_spin(&ifp->if_flt_lock);
    /* prevent filter list from changing in case we drop the lock */
    if_flt_monitor_busy(ifp);
    TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
        if (filter->filt_event != NULL) {
            /* drop the lock around the callback; the busy marker
             * above keeps the list stable while it is released */
            lck_mtx_unlock(&ifp->if_flt_lock);

            filter->filt_event(filter->filt_cookie, ifp,
                filter->filt_protocol, event);

            lck_mtx_lock_spin(&ifp->if_flt_lock);
        }
    }
    /* we're done with the filter list */
    if_flt_monitor_unbusy(ifp);
    lck_mtx_unlock(&ifp->if_flt_lock);

    /* Get an io ref count if the interface is attached */
    if (!ifnet_is_attached(ifp, 1)) {
        goto done;
    }

    /*
     * An embedded tmp_list_entry in if_proto may still get
     * over-written by another thread after giving up ifnet lock,
     * therefore we are avoiding embedded pointers here.
     */
    ifnet_lock_shared(ifp);
    if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
    if (if_proto_count) {
        int i;
        VERIFY(ifp->if_proto_hash != NULL);
        if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
            /* small list: use the on-stack snapshot array */
            tmp_ifproto_arr = tmp_ifproto_stack_arr;
        } else {
            tmp_ifproto_arr = kalloc_type(struct if_proto *,
                if_proto_count, Z_WAITOK | Z_ZERO);
            if (tmp_ifproto_arr == NULL) {
                ifnet_lock_done(ifp);
                goto cleanup;
            }
        }

        /* Snapshot every attached protocol, taking a ref on each so
         * the entries stay valid after the ifnet lock is dropped. */
        for (i = 0; i < PROTO_HASH_SLOTS; i++) {
            SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
                next_hash) {
                if_proto_ref(proto);
                tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
                tmp_ifproto_arr_idx++;
            }
        }
        VERIFY(if_proto_count == tmp_ifproto_arr_idx);
    }
    ifnet_lock_done(ifp);

    /* Invoke each protocol's event handler, then drop the snapshot ref. */
    for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
        tmp_ifproto_arr_idx++) {
        proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
        VERIFY(proto != NULL);
        proto_media_event eventp =
            (proto->proto_kpi == kProtoKPI_v1 ?
            proto->kpi.v1.event :
            proto->kpi.v2.event);

        if (eventp != NULL) {
            eventp(ifp, proto->protocol_family,
                event);
        }
        if_proto_free(proto);
    }

cleanup:
    if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
        kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
    }

    /* Pass the event to the interface */
    if (ifp->if_event != NULL) {
        ifp->if_event(ifp, event);
    }

    /* Release the io ref count */
    ifnet_decr_iorefcnt(ifp);
done:
    return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6815
6816 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6817 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6818 {
6819 struct kev_msg kev_msg;
6820 int result = 0;
6821
6822 if (ifp == NULL || event == NULL) {
6823 return EINVAL;
6824 }
6825
6826 bzero(&kev_msg, sizeof(kev_msg));
6827 kev_msg.vendor_code = event->vendor_code;
6828 kev_msg.kev_class = event->kev_class;
6829 kev_msg.kev_subclass = event->kev_subclass;
6830 kev_msg.event_code = event->event_code;
6831 kev_msg.dv[0].data_ptr = &event->event_data[0];
6832 kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6833 kev_msg.dv[1].data_length = 0;
6834
6835 result = dlil_event_internal(ifp, &kev_msg, TRUE);
6836
6837 return result;
6838 }
6839
6840 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6841 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6842 {
6843 mbuf_t n = m;
6844 int chainlen = 0;
6845
6846 while (n != NULL) {
6847 chainlen++;
6848 n = n->m_next;
6849 }
6850 switch (chainlen) {
6851 case 0:
6852 break;
6853 case 1:
6854 os_atomic_inc(&cls->cls_one, relaxed);
6855 break;
6856 case 2:
6857 os_atomic_inc(&cls->cls_two, relaxed);
6858 break;
6859 case 3:
6860 os_atomic_inc(&cls->cls_three, relaxed);
6861 break;
6862 case 4:
6863 os_atomic_inc(&cls->cls_four, relaxed);
6864 break;
6865 case 5:
6866 default:
6867 os_atomic_inc(&cls->cls_five_or_more, relaxed);
6868 break;
6869 }
6870 }
6871
#if CONFIG_DTRACE
/*
 * Fire the DTrace ip:::send probe for an outbound IPv4 or IPv6 packet.
 * Other protocol families are ignored. Kept out-of-line (noinline) so the
 * probe site stays stable for tracing.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
{
    if (proto_family == PF_INET) {
        struct ip *ip = mtod(m, struct ip *);
        /* v4 packet: v4 header slots filled, v6 header slot NULL */
        DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
            struct ip *, ip, struct ifnet *, ifp,
            struct ip *, ip, struct ip6_hdr *, NULL);
    } else if (proto_family == PF_INET6) {
        struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
        /* v6 packet: v6 header slots filled, v4 header slot NULL */
        DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
            struct ip6_hdr *, ip6, struct ifnet *, ifp,
            struct ip *, NULL, struct ip6_hdr *, ip6);
    }
}
#endif /* CONFIG_DTRACE */
6890
/*
 * dlil_output
 *
 * Caller should have a lock on the protocol domain if the protocol
 * doesn't support finer grained locking. In most cases, the lock
 * will be held from the socket layer and won't be released until
 * we return back to the socket layer.
 *
 * This does mean that we must take a protocol lock before we take
 * an interface lock if we're going to take both. This makes sense
 * because a protocol is likely to interact with an ifp while it
 * is under the protocol lock.
 *
 * An advisory code will be returned if adv is not null. This
 * can be used to provide feedback about interface queues to the
 * application.
 *
 * Overview of the per-packet loop below: each packet of packetlist is
 * (optionally) CLAT46-translated, framed by the protocol pre-output and
 * interface framer, run through the output interface filters, then either
 * batched onto send_head (IFEF_SENDLIST/IFEF_ENQUEUE_MULTI interfaces) or
 * handed to the driver one at a time via if_output_dlil.
 */
errno_t
dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
    void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
{
    char *frame_type = NULL;
    char *dst_linkaddr = NULL;
    int retval = 0;
    char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
    char dst_linkaddr_buffer[MAX_LINKADDR * 4];
    struct if_proto *proto = NULL;
    mbuf_t m = NULL;
    mbuf_t send_head = NULL;
    mbuf_t *send_tail = &send_head;      /* tail pointer for the batched list */
    int iorefcnt = 0;                    /* 1 while we hold a datamov ref on ifp */
    u_int32_t pre = 0, post = 0;         /* bytes the framer pre/appended */
    u_int32_t fpkts = 0, fbytes = 0;     /* forwarded packet/byte accounting */
    int32_t flen = 0;
    struct timespec now;
    u_int64_t now_nsec;
    boolean_t did_clat46 = FALSE;
    protocol_family_t old_proto_family = proto_family;
    struct sockaddr_in6 dest6;
    struct rtentry *rt = NULL;           /* route ref taken for CLAT46; freed in cleanup */
    u_int32_t m_loop_set = 0;

    KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

    /*
     * Get an io refcnt if the interface is attached to prevent ifnet_detach
     * from happening while this operation is in progress
     */
    if (!ifnet_datamov_begin(ifp)) {
        retval = ENXIO;
        goto cleanup;
    }
    iorefcnt = 1;

    VERIFY(ifp->if_output_dlil != NULL);

    /* update the driver's multicast filter, if needed */
    if (ifp->if_updatemcasts > 0) {
        if_mcasts_update_async(ifp);
        ifp->if_updatemcasts = 0;
    }

    frame_type = frame_type_buffer;
    dst_linkaddr = dst_linkaddr_buffer;

    if (raw == 0) {
        ifnet_lock_shared(ifp);
        /* callee holds a proto refcnt upon success */
        proto = find_attached_proto(ifp, proto_family);
        if (proto == NULL) {
            ifnet_lock_done(ifp);
            retval = ENXIO;
            goto cleanup;
        }
        ifnet_lock_done(ifp);
    }

preout_again:
    if (packetlist == NULL) {
        goto cleanup;
    }

    /* detach the head packet from the list for individual processing */
    m = packetlist;
    packetlist = packetlist->m_nextpkt;
    m->m_nextpkt = NULL;

    m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);

    /*
     * Perform address family translation for the first
     * packet outside the loop in order to perform address
     * lookup for the translated proto family.
     */
    if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
        (ifp->if_type == IFT_CELLULAR ||
        dlil_is_clat_needed(proto_family, m))) {
        retval = dlil_clat46(ifp, &proto_family, &m);
        /*
         * Go to the next packet if translation fails
         */
        if (retval != 0) {
            m_freem(m);
            m = NULL;
            ip6stat.ip6s_clat464_out_drop++;
            /* Make sure that the proto family is PF_INET */
            ASSERT(proto_family == PF_INET);
            goto preout_again;
        }
        /*
         * Free the old one and make it point to the IPv6 proto structure.
         *
         * Change proto for the first time we have successfully
         * performed address family translation.
         */
        if (!did_clat46 && proto_family == PF_INET6) {
            did_clat46 = TRUE;

            if (proto != NULL) {
                if_proto_free(proto);
            }
            ifnet_lock_shared(ifp);
            /* callee holds a proto refcnt upon success */
            proto = find_attached_proto(ifp, proto_family);
            if (proto == NULL) {
                ifnet_lock_done(ifp);
                retval = ENXIO;
                m_freem(m);
                m = NULL;
                goto cleanup;
            }
            ifnet_lock_done(ifp);
            if (ifp->if_type == IFT_ETHER) {
                /* Update the dest to translated v6 address */
                dest6.sin6_len = sizeof(struct sockaddr_in6);
                dest6.sin6_family = AF_INET6;
                dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
                dest = (const struct sockaddr *)&dest6;

                /*
                 * Lookup route to the translated destination
                 * Free this route ref during cleanup
                 */
                rt = rtalloc1_scoped((struct sockaddr *)&dest6,
                    0, 0, ifp->if_index);

                route = rt;
            }
        }
    }

    /*
     * This path gets packet chain going to the same destination.
     * The pre output routine is used to either trigger resolution of
     * the next hop or retreive the next hop's link layer addressing.
     * For ex: ether_inet(6)_pre_output routine.
     *
     * If the routine returns EJUSTRETURN, it implies that packet has
     * been queued, and therefore we have to call preout_again for the
     * following packet in the chain.
     *
     * For errors other than EJUSTRETURN, the current packet is freed
     * and the rest of the chain (pointed by packetlist is freed as
     * part of clean up.
     *
     * Else if there is no error the retrieved information is used for
     * all the packets in the chain.
     */
    if (raw == 0) {
        proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
            proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
        retval = 0;
        if (preoutp != NULL) {
            retval = preoutp(ifp, proto_family, &m, dest, route,
                frame_type, dst_linkaddr);

            if (retval != 0) {
                if (retval == EJUSTRETURN) {
                    goto preout_again;
                }
                m_freem(m);
                m = NULL;
                goto cleanup;
            }
        }
    }

    do {
        /*
         * pkt_hdr is set here to point to m_data prior to
         * calling into the framer. This value of pkt_hdr is
         * used by the netif gso logic to retrieve the ip header
         * for the TCP packets, offloaded for TSO processing.
         */
        if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
            uint8_t vlan_encap_len = 0;

            if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
                vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
            }
            /* raw ethernet frame: skip past the L2 (+VLAN) header */
            m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
        } else {
            m->m_pkthdr.pkt_hdr = mtod(m, void *);
        }

        /*
         * Perform address family translation if needed.
         * For now we only support stateless 4 to 6 translation
         * on the out path.
         *
         * The routine below translates IP header, updates protocol
         * checksum and also translates ICMP.
         *
         * We skip the first packet as it is already translated and
         * the proto family is set to PF_INET6.
         */
        if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
            (ifp->if_type == IFT_CELLULAR ||
            dlil_is_clat_needed(proto_family, m))) {
            retval = dlil_clat46(ifp, &proto_family, &m);
            /* Goto the next packet if the translation fails */
            if (retval != 0) {
                m_freem(m);
                m = NULL;
                ip6stat.ip6s_clat464_out_drop++;
                goto next;
            }
        }

#if CONFIG_DTRACE
        if (!raw) {
            dlil_output_dtrace(ifp, proto_family, m);
        }
#endif /* CONFIG_DTRACE */

        if (raw == 0 && ifp->if_framer != NULL) {
            int rcvif_set = 0;

            /*
             * If this is a broadcast packet that needs to be
             * looped back into the system, set the inbound ifp
             * to that of the outbound ifp. This will allow
             * us to determine that it is a legitimate packet
             * for the system. Only set the ifp if it's not
             * already set, just to be safe.
             */
            if ((m->m_flags & (M_BCAST | M_LOOP)) &&
                m->m_pkthdr.rcvif == NULL) {
                m->m_pkthdr.rcvif = ifp;
                rcvif_set = 1;
            }
            /* remember M_LOOP so it can be restored on the next packet */
            m_loop_set = m->m_flags & M_LOOP;
            retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
                frame_type, &pre, &post);
            if (retval != 0) {
                if (retval != EJUSTRETURN) {
                    m_freem(m);
                }
                goto next;
            }

            /*
             * For partial checksum offload, adjust the start
             * and stuff offsets based on the prepended header.
             */
            if ((m->m_pkthdr.csum_flags &
                (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
                (CSUM_DATA_VALID | CSUM_PARTIAL)) {
                m->m_pkthdr.csum_tx_stuff += pre;
                m->m_pkthdr.csum_tx_start += pre;
            }

            if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
                dlil_output_cksum_dbg(ifp, m, pre,
                    proto_family);
            }

            /*
             * Clear the ifp if it was set above, and to be
             * safe, only if it is still the same as the
             * outbound ifp we have in context. If it was
             * looped back, then a copy of it was sent to the
             * loopback interface with the rcvif set, and we
             * are clearing the one that will go down to the
             * layer below.
             */
            if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
                m->m_pkthdr.rcvif = NULL;
            }
        }

        /*
         * Let interface filters (if any) do their thing ...
         */
        retval = dlil_interface_filters_output(ifp, &m, proto_family);
        if (retval != 0) {
            if (retval != EJUSTRETURN) {
                m_freem(m);
            }
            goto next;
        }
        /*
         * Strip away M_PROTO1 bit prior to sending packet
         * to the driver as this field may be used by the driver
         */
        m->m_flags &= ~M_PROTO1;

        /*
         * If the underlying interface is not capable of handling a
         * packet whose data portion spans across physically disjoint
         * pages, we need to "normalize" the packet so that we pass
         * down a chain of mbufs where each mbuf points to a span that
         * resides in the system page boundary. If the packet does
         * not cross page(s), the following is a no-op.
         */
        if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
            if ((m = m_normalize(m)) == NULL) {
                goto next;
            }
        }

        /*
         * If this is a TSO packet, make sure the interface still
         * advertise TSO capability.
         */
        if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
            retval = EMSGSIZE;
            m_freem(m);
            goto cleanup;
        }

        ifp_inc_traffic_class_out(ifp, m);

#if SKYWALK
        /*
         * For native skywalk devices, packets will be passed to pktap
         * after GSO or after the mbuf to packet conversion.
         * This is done for IPv4/IPv6 packets only because there is no
         * space in the mbuf to pass down the proto family.
         */
        if (dlil_is_native_netif_nexus(ifp)) {
            if (raw || m->m_pkthdr.pkt_proto == 0) {
                pktap_output(ifp, proto_family, m, pre, post);
                m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
            }
        } else {
            pktap_output(ifp, proto_family, m, pre, post);
        }
#else /* SKYWALK */
        pktap_output(ifp, proto_family, m, pre, post);
#endif /* SKYWALK */

        /*
         * Count the number of elements in the mbuf chain
         */
        if (tx_chain_len_count) {
            dlil_count_chain_len(m, &tx_chain_len_stats);
        }

        /*
         * Record timestamp; ifnet_enqueue() will use this info
         * rather than redoing the work. An optimization could
         * involve doing this just once at the top, if there are
         * no interface filters attached, but that's probably
         * not a big deal.
         */
        nanouptime(&now);
        net_timernsec(&now, &now_nsec);
        (void) mbuf_set_timestamp(m, now_nsec, TRUE);

        /*
         * Discard partial sum information if this packet originated
         * from another interface; the packet would already have the
         * final checksum and we shouldn't recompute it.
         */
        if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
            (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
            (CSUM_DATA_VALID | CSUM_PARTIAL)) {
            m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
            m->m_pkthdr.csum_data = 0;
        }

        /*
         * Finally, call the driver.
         */
        if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
            /* batch: accumulate onto send_head, delivered after the loop */
            if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
                flen += (m_pktlen(m) - (pre + post));
                m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
            }
            *send_tail = m;
            send_tail = &m->m_nextpkt;
        } else {
            if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
                flen = (m_pktlen(m) - (pre + post));
                m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
            } else {
                flen = 0;
            }
            KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
                0, 0, 0, 0, 0);
            retval = (*ifp->if_output_dlil)(ifp, m);
            if (retval == EQFULL || retval == EQSUSPENDED) {
                /* translate queue pressure into a flow advisory */
                if (adv != NULL && adv->code == FADV_SUCCESS) {
                    adv->code = (retval == EQFULL ?
                        FADV_FLOW_CONTROLLED :
                        FADV_SUSPENDED);
                }
                retval = 0;
            }
            if (retval == 0 && flen > 0) {
                fbytes += flen;
                fpkts++;
            }
            if (retval != 0 && dlil_verbose) {
                DLIL_PRINTF("%s: output error on %s retval = %d\n",
                    __func__, if_name(ifp),
                    retval);
            }
            KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
                0, 0, 0, 0, 0);
        }
        KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

next:
        m = packetlist;
        if (m != NULL) {
            /* restore M_LOOP (cleared/saved by the framer path above) */
            m->m_flags |= m_loop_set;
            packetlist = packetlist->m_nextpkt;
            m->m_nextpkt = NULL;
        }
        /* Reset the proto family to old proto family for CLAT */
        if (did_clat46) {
            proto_family = old_proto_family;
        }
    } while (m != NULL);

    if (send_head != NULL) {
        KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
            0, 0, 0, 0, 0);
        if (ifp->if_eflags & IFEF_SENDLIST) {
            /* driver accepts a whole packet list in one call */
            retval = (*ifp->if_output_dlil)(ifp, send_head);
            if (retval == EQFULL || retval == EQSUSPENDED) {
                if (adv != NULL) {
                    adv->code = (retval == EQFULL ?
                        FADV_FLOW_CONTROLLED :
                        FADV_SUSPENDED);
                }
                retval = 0;
            }
            if (retval == 0 && flen > 0) {
                fbytes += flen;
                fpkts++;
            }
            if (retval != 0 && dlil_verbose) {
                DLIL_PRINTF("%s: output error on %s retval = %d\n",
                    __func__, if_name(ifp), retval);
            }
        } else {
            /* IFEF_ENQUEUE_MULTI: enqueue one-by-one, start once at end */
            struct mbuf *send_m;
            int enq_cnt = 0;
            VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
            while (send_head != NULL) {
                send_m = send_head;
                send_head = send_m->m_nextpkt;
                send_m->m_nextpkt = NULL;
                retval = (*ifp->if_output_dlil)(ifp, send_m);
                if (retval == EQFULL || retval == EQSUSPENDED) {
                    if (adv != NULL) {
                        adv->code = (retval == EQFULL ?
                            FADV_FLOW_CONTROLLED :
                            FADV_SUSPENDED);
                    }
                    retval = 0;
                }
                if (retval == 0) {
                    enq_cnt++;
                    if (flen > 0) {
                        fpkts++;
                    }
                }
                if (retval != 0 && dlil_verbose) {
                    DLIL_PRINTF("%s: output error on %s "
                        "retval = %d\n",
                        __func__, if_name(ifp), retval);
                }
            }
            if (enq_cnt > 0) {
                fbytes += flen;
                ifnet_start(ifp);
            }
        }
        KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
    }

    KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

cleanup:
    /* fold forwarded-traffic accounting into the interface counters */
    if (fbytes > 0) {
        ifp->if_fbytes += fbytes;
    }
    if (fpkts > 0) {
        ifp->if_fpackets += fpkts;
    }
    if (proto != NULL) {
        if_proto_free(proto);
    }
    if (packetlist) { /* if any packets are left, clean up */
        mbuf_freem_list(packetlist);
    }
    if (retval == EJUSTRETURN) {
        retval = 0;
    }
    if (iorefcnt == 1) {
        ifnet_datamov_end(ifp);
    }
    if (rt != NULL) {
        rtfree(rt);
        rt = NULL;
    }

    return retval;
}
7413
7414 /*
7415 * This routine checks if the destination address is not a loopback, link-local,
7416 * multicast or broadcast address.
7417 */
7418 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7419 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7420 {
7421 int ret = 0;
7422 switch (proto_family) {
7423 case PF_INET: {
7424 struct ip *iph = mtod(m, struct ip *);
7425 if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7426 ret = 1;
7427 }
7428 break;
7429 }
7430 case PF_INET6: {
7431 struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7432 if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7433 CLAT64_NEEDED(&ip6h->ip6_dst)) {
7434 ret = 1;
7435 }
7436 break;
7437 }
7438 }
7439
7440 return ret;
7441 }
/*
 * @brief This routine translates IPv4 packet to IPv6 packet,
 * updates protocol checksum and also translates ICMP for code
 * along with inner header translation.
 *
 * @param ifp Pointer to the interface
 * @param proto_family pointer to protocol family. It is updated (to
 * PF_INET6) only if the function performs the translation successfully.
 * @param m Pointer to the pointer pointing to the packet. Needed because this
 * routine can end up changing the mbuf to a different one. On failure *m may
 * be set to NULL (mbuf consumed by the pbuf machinery).
 *
 * @return 0 on success or else a negative value.
 */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
    VERIFY(*proto_family == PF_INET);
    VERIFY(IS_INTF_CLAT46(ifp));

    pbuf_t pbuf_store, *pbuf = NULL;
    struct ip *iph = NULL;
    struct in_addr osrc, odst;      /* original v4 source/destination */
    uint8_t proto = 0;              /* v4 transport protocol number */
    struct in6_ifaddr *ia6_clat_src = NULL;
    struct in6_addr *src = NULL;    /* synthesized/reserved v6 source */
    struct in6_addr dst;            /* synthesized v6 destination */
    int error = 0;
    uint16_t off = 0;               /* v4 header length in bytes */
    uint16_t tot_len = 0;
    uint16_t ip_id_val = 0;
    uint16_t ip_frag_off = 0;

    boolean_t is_frag = FALSE;
    boolean_t is_first_frag = TRUE;
    boolean_t is_last_frag = TRUE;

    /* Wrap the mbuf in a pbuf; the NAT464 helpers operate on pbufs. */
    pbuf_init_mbuf(&pbuf_store, *m, ifp);
    pbuf = &pbuf_store;
    iph = pbuf->pb_data;

    /* Capture the v4 header fields needed after translation. */
    osrc = iph->ip_src;
    odst = iph->ip_dst;
    proto = iph->ip_p;
    off = (uint16_t)(iph->ip_hl << 2);
    ip_id_val = iph->ip_id;
    ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

    tot_len = ntohs(iph->ip_len);

    /*
     * For packets that are not first frags
     * we only need to adjust CSUM.
     * For 4 to 6, Fragmentation header gets appended
     * after proto translation.
     */
    if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
        is_frag = TRUE;

        /* If the offset is not zero, it is not first frag */
        if (ip_frag_off != 0) {
            is_first_frag = FALSE;
        }

        /* If IP_MF is set, then it is not last frag */
        if (ntohs(iph->ip_off) & IP_MF) {
            is_last_frag = FALSE;
        }
    }

    /*
     * Retrive the local IPv6 CLAT46 address reserved for stateless
     * translation.
     */
    ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
    if (ia6_clat_src == NULL) {
        ip6stat.ip6s_clat464_out_nov6addr_drop++;
        error = -1;
        goto cleanup;
    }

    src = &ia6_clat_src->ia_addr.sin6_addr;

    /*
     * Translate IPv4 destination to IPv6 destination by using the
     * prefixes learned through prior PLAT discovery.
     */
    if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
        ip6stat.ip6s_clat464_out_v6synthfail_drop++;
        goto cleanup;
    }

    /* Translate the IP header part first */
    error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
        iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;

    iph = NULL; /* Invalidate iph as pbuf has been modified */

    if (error != 0) {
        ip6stat.ip6s_clat464_out_46transfail_drop++;
        goto cleanup;
    }

    /*
     * Translate protocol header, update checksum, checksum flags
     * and related fields.
     */
    error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
        proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

    if (error != 0) {
        ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
        goto cleanup;
    }

    /* Now insert the IPv6 fragment header */
    if (is_frag) {
        error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

        if (error != 0) {
            ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
            goto cleanup;
        }
    }

cleanup:
    if (ia6_clat_src != NULL) {
        IFA_REMREF(&ia6_clat_src->ia_ifa);
    }

    /* Hand the (possibly reallocated) mbuf back to the caller. */
    if (pbuf_is_valid(pbuf)) {
        *m = pbuf->pb_mbuf;
        pbuf->pb_mbuf = NULL;
        pbuf_destroy(pbuf);
    } else {
        /* pbuf lost its mbuf during translation; nothing to return */
        error = -1;
        *m = NULL;
        ip6stat.ip6s_clat464_out_invalpbuf_drop++;
    }

    if (error == 0) {
        *proto_family = PF_INET6;
        ip6stat.ip6s_clat464_out_success++;
    }

    return error;
}
7588
/*
 * @brief This routine translates incoming IPv6 to IPv4 packet,
 * updates protocol checksum and also translates ICMPv6 outer
 * and inner headers
 *
 * Only packets destined to this interface's reserved CLAT46 IPv6 address
 * are translated; everything else passes through untouched. On success
 * *proto_family is updated to PF_INET.
 *
 * @return 0 on success or else a negative value.
 */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
    VERIFY(*proto_family == PF_INET6);
    VERIFY(IS_INTF_CLAT46(ifp));

    struct ip6_hdr *ip6h = NULL;
    struct in6_addr osrc, odst;        /* original v6 source/destination */
    uint8_t proto = 0;                 /* transport protocol (may be rewritten) */
    struct in6_ifaddr *ia6_clat_dst = NULL;
    struct in_ifaddr *ia4_clat_dst = NULL;
    struct in_addr *dst = NULL;        /* reserved local v4 destination */
    struct in_addr src;                /* synthesized v4 source */
    int error = 0;
    uint32_t off = 0;
    u_int64_t tot_len = 0;
    uint8_t tos = 0;
    boolean_t is_first_frag = TRUE;

    /* Incoming mbuf does not contain valid IP6 header */
    if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
        ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
        (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
        ip6stat.ip6s_clat464_in_tooshort_drop++;
        return -1;
    }

    ip6h = mtod(*m, struct ip6_hdr *);
    /* Validate that mbuf contains IP payload equal to ip6_plen */
    if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
        ip6stat.ip6s_clat464_in_tooshort_drop++;
        return -1;
    }

    osrc = ip6h->ip6_src;
    odst = ip6h->ip6_dst;

    /*
     * Retrieve the local CLAT46 reserved IPv6 address.
     * Let the packet pass if we don't find one, as the flag
     * may get set before IPv6 configuration has taken place.
     */
    ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
    if (ia6_clat_dst == NULL) {
        goto done;
    }

    /*
     * Check if the original dest in the packet is same as the reserved
     * CLAT46 IPv6 address
     */
    if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
        pbuf_t pbuf_store, *pbuf = NULL;
        /* NAT464 helpers operate on pbufs; wrap the mbuf. */
        pbuf_init_mbuf(&pbuf_store, *m, ifp);
        pbuf = &pbuf_store;

        /*
         * Retrive the local CLAT46 IPv4 address reserved for stateless
         * translation.
         */
        ia4_clat_dst = inifa_ifpclatv4(ifp);
        if (ia4_clat_dst == NULL) {
            IFA_REMREF(&ia6_clat_dst->ia_ifa);
            ip6stat.ip6s_clat464_in_nov4addr_drop++;
            error = -1;
            goto cleanup;
        }
        IFA_REMREF(&ia6_clat_dst->ia_ifa);

        /* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
        dst = &ia4_clat_dst->ia_addr.sin_addr;
        if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
            ip6stat.ip6s_clat464_in_v4synthfail_drop++;
            error = -1;
            goto cleanup;
        }

        /* Gather v6 header fields needed for the 6-to-4 translation. */
        ip6h = pbuf->pb_data;
        off = sizeof(struct ip6_hdr);
        proto = ip6h->ip6_nxt;
        tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;  /* traffic class */
        tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

        /*
         * Translate the IP header and update the fragmentation
         * header if needed
         */
        error = (nat464_translate_64(pbuf, off, tos, &proto,
            ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
            0 : -1;

        ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

        if (error != 0) {
            ip6stat.ip6s_clat464_in_64transfail_drop++;
            goto cleanup;
        }

        /*
         * Translate protocol header, update checksum, checksum flags
         * and related fields.
         */
        error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
            (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
            NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

        if (error != 0) {
            ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
            goto cleanup;
        }

cleanup:
        if (ia4_clat_dst != NULL) {
            IFA_REMREF(&ia4_clat_dst->ia_ifa);
        }

        /* Hand the (possibly reallocated) mbuf back to the caller. */
        if (pbuf_is_valid(pbuf)) {
            *m = pbuf->pb_mbuf;
            pbuf->pb_mbuf = NULL;
            pbuf_destroy(pbuf);
        } else {
            error = -1;
            ip6stat.ip6s_clat464_in_invalpbuf_drop++;
        }

        if (error == 0) {
            *proto_family = PF_INET;
            ip6stat.ip6s_clat464_in_success++;
        }
    } /* CLAT traffic */

done:
    return error;
}
7730
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/* Argument bundle for a deferred interface ioctl (see ifnet_ioctl_async). */
struct ifnet_ioctl_event {
	struct ifnet *ifp;      /* target interface; io ref held while queued */
	u_long ioctl_code;      /* SIOCADDMULTI or SIOCDELMULTI */
};

/* Work-queue entry wrapper; recovered via __container_of in the callback. */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7743
7744 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7745 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7746 {
7747 struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7748 bool compare_expected;
7749
7750 /*
7751 * Get an io ref count if the interface is attached.
7752 * At this point it most likely is. We are taking a reference for
7753 * deferred processing.
7754 */
7755 if (!ifnet_is_attached(ifp, 1)) {
7756 os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7757 "is not attached",
7758 __func__, __LINE__, if_name(ifp), ioctl_code);
7759 return;
7760 }
7761 switch (ioctl_code) {
7762 case SIOCADDMULTI:
7763 compare_expected = false;
7764 if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
7765 ifnet_decr_iorefcnt(ifp);
7766 return;
7767 }
7768 break;
7769 case SIOCDELMULTI:
7770 compare_expected = false;
7771 if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
7772 ifnet_decr_iorefcnt(ifp);
7773 return;
7774 }
7775 break;
7776 default:
7777 os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
7778 __func__, __LINE__, if_name(ifp), ioctl_code);
7779 return;
7780 }
7781
7782 p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7783 Z_WAITOK | Z_ZERO | Z_NOFAIL);
7784
7785 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7786 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7787 p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7788 nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7789 }
7790
7791 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7792 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7793 {
7794 struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7795 struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7796
7797 struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7798 u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7799 int ret = 0;
7800
7801 switch (ioctl_code) {
7802 case SIOCADDMULTI:
7803 atomic_store(&ifp->if_mcast_add_signaled, false);
7804 break;
7805 case SIOCDELMULTI:
7806 atomic_store(&ifp->if_mcast_del_signaled, false);
7807 break;
7808 }
7809 if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7810 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7811 __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7812 } else if (dlil_verbose) {
7813 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7814 "for ioctl %lu",
7815 __func__, __LINE__, if_name(ifp), ioctl_code);
7816 }
7817 ifnet_decr_iorefcnt(ifp);
7818 kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7819 return;
7820 }
7821
/*
 * Dispatch an ioctl to an interface: the attached interface filters run
 * first, then the protocol handler (when proto_fam != 0), and finally the
 * driver's own if_ioctl.  The first non-EOPNOTSUPP result wins; EJUSTRETURN
 * from any layer stops further processing and is reported as 0.
 *
 * Returns EINVAL for bad arguments, EOPNOTSUPP if the interface is detached
 * or no layer handled the request, otherwise the handler's result.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock across the filter's callback */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				/* normalize ENOTSUP to EOPNOTSUPP */
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled, stop here" — report success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7939
7940 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7941 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7942 {
7943 errno_t error = 0;
7944
7945 if (ifp->if_set_bpf_tap) {
7946 /* Get an io reference on the interface if it is attached */
7947 if (!ifnet_is_attached(ifp, 1)) {
7948 return ENXIO;
7949 }
7950 error = ifp->if_set_bpf_tap(ifp, mode, callback);
7951 ifnet_decr_iorefcnt(ifp);
7952 }
7953 return error;
7954 }
7955
7956 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7957 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7958 struct sockaddr *ll_addr, size_t ll_len)
7959 {
7960 errno_t result = EOPNOTSUPP;
7961 struct if_proto *proto;
7962 const struct sockaddr *verify;
7963 proto_media_resolve_multi resolvep;
7964
7965 if (!ifnet_is_attached(ifp, 1)) {
7966 return result;
7967 }
7968
7969 bzero(ll_addr, ll_len);
7970
7971 /* Call the protocol first; callee holds a proto refcnt upon success */
7972 ifnet_lock_shared(ifp);
7973 proto = find_attached_proto(ifp, proto_addr->sa_family);
7974 ifnet_lock_done(ifp);
7975 if (proto != NULL) {
7976 resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7977 proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7978 if (resolvep != NULL) {
7979 result = resolvep(ifp, proto_addr,
7980 (struct sockaddr_dl *)(void *)ll_addr, ll_len);
7981 }
7982 if_proto_free(proto);
7983 }
7984
7985 /* Let the interface verify the multicast address */
7986 if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7987 if (result == 0) {
7988 verify = ll_addr;
7989 } else {
7990 verify = proto_addr;
7991 }
7992 result = ifp->if_check_multi(ifp, verify);
7993 }
7994
7995 ifnet_decr_iorefcnt(ifp);
7996 return result;
7997 }
7998
7999 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8000 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
8001 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8002 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8003 {
8004 struct if_proto *proto;
8005 errno_t result = 0;
8006
8007 if ((ifp->if_flags & IFF_NOARP) != 0) {
8008 result = ENOTSUP;
8009 goto done;
8010 }
8011
8012 /* callee holds a proto refcnt upon success */
8013 ifnet_lock_shared(ifp);
8014 proto = find_attached_proto(ifp, target_proto->sa_family);
8015 ifnet_lock_done(ifp);
8016 if (proto == NULL) {
8017 result = ENOTSUP;
8018 } else {
8019 proto_media_send_arp arpp;
8020 arpp = (proto->proto_kpi == kProtoKPI_v1 ?
8021 proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
8022 if (arpp == NULL) {
8023 result = ENOTSUP;
8024 } else {
8025 switch (arpop) {
8026 case ARPOP_REQUEST:
8027 arpstat.txrequests++;
8028 if (target_hw != NULL) {
8029 arpstat.txurequests++;
8030 }
8031 break;
8032 case ARPOP_REPLY:
8033 arpstat.txreplies++;
8034 break;
8035 }
8036 result = arpp(ifp, arpop, sender_hw, sender_proto,
8037 target_hw, target_proto);
8038 }
8039 if_proto_free(proto);
8040 }
8041 done:
8042 return result;
8043 }
8044
/*
 * Opaque, empty tag type for network-thread mark tokens.  A token is a
 * pointer offset from net_thread_marks_base; the byte offset encodes the
 * 32-bit mask of bits changed by the matching push (no allocation needed).
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* The "no bits were changed" token: offset 0 from the base object. */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
8050
/*
 * Set the bits in `push` on the current uthread's network mark word and
 * return a token recording which bits were NEWLY set here.  The token is
 * the address `pop` bytes past `base`, so net_thread_marks_pop() can
 * recover the mask by pointer subtraction and clear exactly those bits.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* only bits not already set become ours to clear on pop */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	return (net_thread_marks_t)&base[pop];
}
8068
/*
 * Clear the bits in `unpush` on the current uthread's network mark word
 * and return a token recording which bits were actually cleared (encoded
 * as a byte offset from `base`), so net_thread_unmarks_pop() can restore
 * exactly those bits.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* only bits currently set become ours to restore on pop */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	return (net_thread_marks_t)&base[unpop];
}
8086
/*
 * Undo a net_thread_marks_push(): recover the bit mask from the token's
 * byte offset relative to `base` and clear exactly those bits on the
 * current uthread.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* the token must encode a valid 32-bit mask... */
		VERIFY((pop & ones) == pop);
		/* ...whose bits are all still set on this thread */
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
8102
/*
 * Undo a net_thread_unmarks_push(): recover the bit mask from the token's
 * byte offset relative to `base` and re-set exactly those bits on the
 * current uthread.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* the token must encode a valid 32-bit mask... */
		VERIFY((unpop & ones) == unpop);
		/* ...whose bits are all currently clear on this thread */
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
8118
8119 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)8120 net_thread_is_marked(u_int32_t check)
8121 {
8122 if (check != 0) {
8123 struct uthread *uth = current_uthread();
8124 return uth->uu_network_marks & check;
8125 } else {
8126 return 0;
8127 }
8128 }
8129
8130 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)8131 net_thread_is_unmarked(u_int32_t check)
8132 {
8133 if (check != 0) {
8134 struct uthread *uth = current_uthread();
8135 return ~uth->uu_network_marks & check;
8136 } else {
8137 return 0;
8138 }
8139 }
8140
/*
 * An ARP announcement (gratuitous ARP) carries the same IPv4 address as
 * both sender and target.  Returns nonzero when that is the case; zero
 * when the addresses differ or either argument is NULL.
 */
static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,
    const struct sockaddr_in * target_sin)
{
	if (sender_sin == NULL || target_sin == NULL) {
		return 0;
	}
	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
}
8151
/*
 * Send an ARP packet.  Normally forwards to dlil_send_arp_internal() on
 * the given interface; the special case is an ARPOP_REQUEST for an IPv4
 * link-local target (and not a self-announcement), which is broadcast on
 * every IFEF_ARPLL-capable interface that has an IPv4 source address.
 * RTF_ROUTER in rtflags tags the target sockaddr with SIN_ROUTER so the
 * protocol's send_arp callback knows the target is a (default) router.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		/* NOTE(review): assumes target is AF_INET here — the copy is
		 * sockaddr_in-sized; confirm no AF_INET6 caller sets RTF_ROUTER */
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces. The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t *ifp_list;
		u_int32_t count;
		u_int32_t ifp_on;

		/* ENOTSUP until at least one interface accepts the ARP */
		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing. This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* keep the lladdr ifa alive across the send */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* keep the first meaningful result */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
8266
8267 /*
8268 * Caller must hold ifnet head lock.
8269 */
8270 static int
ifnet_lookup(struct ifnet * ifp)8271 ifnet_lookup(struct ifnet *ifp)
8272 {
8273 struct ifnet *_ifp;
8274
8275 LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
8276 TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
8277 if (_ifp == ifp) {
8278 break;
8279 }
8280 }
8281 return _ifp != NULL;
8282 }
8283
8284 /*
8285 * Caller has to pass a non-zero refio argument to get a
8286 * IO reference count. This will prevent ifnet_detach from
8287 * being called when there are outstanding io reference counts.
8288 */
8289 int
ifnet_is_attached(struct ifnet * ifp,int refio)8290 ifnet_is_attached(struct ifnet *ifp, int refio)
8291 {
8292 int ret;
8293
8294 lck_mtx_lock_spin(&ifp->if_ref_lock);
8295 if ((ret = IF_FULLY_ATTACHED(ifp))) {
8296 if (refio > 0) {
8297 ifp->if_refio++;
8298 }
8299 }
8300 lck_mtx_unlock(&ifp->if_ref_lock);
8301
8302 return ret;
8303 }
8304
8305 void
ifnet_incr_pending_thread_count(struct ifnet * ifp)8306 ifnet_incr_pending_thread_count(struct ifnet *ifp)
8307 {
8308 lck_mtx_lock_spin(&ifp->if_ref_lock);
8309 ifp->if_threads_pending++;
8310 lck_mtx_unlock(&ifp->if_ref_lock);
8311 }
8312
8313 void
ifnet_decr_pending_thread_count(struct ifnet * ifp)8314 ifnet_decr_pending_thread_count(struct ifnet *ifp)
8315 {
8316 lck_mtx_lock_spin(&ifp->if_ref_lock);
8317 VERIFY(ifp->if_threads_pending > 0);
8318 ifp->if_threads_pending--;
8319 if (ifp->if_threads_pending == 0) {
8320 wakeup(&ifp->if_threads_pending);
8321 }
8322 lck_mtx_unlock(&ifp->if_ref_lock);
8323 }
8324
8325 /*
8326 * Caller must ensure the interface is attached; the assumption is that
8327 * there is at least an outstanding IO reference count held already.
8328 * Most callers would call ifnet_is_{attached,data_ready}() instead.
8329 */
8330 void
ifnet_incr_iorefcnt(struct ifnet * ifp)8331 ifnet_incr_iorefcnt(struct ifnet *ifp)
8332 {
8333 lck_mtx_lock_spin(&ifp->if_ref_lock);
8334 VERIFY(IF_FULLY_ATTACHED(ifp));
8335 VERIFY(ifp->if_refio > 0);
8336 ifp->if_refio++;
8337 lck_mtx_unlock(&ifp->if_ref_lock);
8338 }
8339
8340 __attribute__((always_inline))
8341 static void
ifnet_decr_iorefcnt_locked(struct ifnet * ifp)8342 ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
8343 {
8344 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
8345
8346 VERIFY(ifp->if_refio > 0);
8347 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8348
8349 ifp->if_refio--;
8350 VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);
8351
8352 /*
8353 * if there are no more outstanding io references, wakeup the
8354 * ifnet_detach thread if detaching flag is set.
8355 */
8356 if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
8357 wakeup(&(ifp->if_refio));
8358 }
8359 }
8360
8361 void
ifnet_decr_iorefcnt(struct ifnet * ifp)8362 ifnet_decr_iorefcnt(struct ifnet *ifp)
8363 {
8364 lck_mtx_lock_spin(&ifp->if_ref_lock);
8365 ifnet_decr_iorefcnt_locked(ifp);
8366 lck_mtx_unlock(&ifp->if_ref_lock);
8367 }
8368
8369 boolean_t
ifnet_datamov_begin(struct ifnet * ifp)8370 ifnet_datamov_begin(struct ifnet *ifp)
8371 {
8372 boolean_t ret;
8373
8374 lck_mtx_lock_spin(&ifp->if_ref_lock);
8375 if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8376 ifp->if_refio++;
8377 ifp->if_datamov++;
8378 }
8379 lck_mtx_unlock(&ifp->if_ref_lock);
8380
8381 return ret;
8382 }
8383
8384 void
ifnet_datamov_end(struct ifnet * ifp)8385 ifnet_datamov_end(struct ifnet *ifp)
8386 {
8387 lck_mtx_lock_spin(&ifp->if_ref_lock);
8388 VERIFY(ifp->if_datamov > 0);
8389 /*
8390 * if there's no more thread moving data, wakeup any
8391 * drainers that's blocked waiting for this.
8392 */
8393 if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
8394 DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
8395 DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
8396 wakeup(&(ifp->if_datamov));
8397 }
8398 ifnet_decr_iorefcnt_locked(ifp);
8399 lck_mtx_unlock(&ifp->if_ref_lock);
8400 }
8401
/*
 * Suspend data movement; if_ref_lock must be held.  Takes an io reference
 * for the duration of the suspension and, on the first suspension, clears
 * IFRF_READY so new ifnet_datamov_begin() calls fail.
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8412
/*
 * Suspend data movement on an interface (takes if_ref_lock).  The io
 * reference taken by the locked helper is released by
 * ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8421
8422 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8423 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8424 {
8425 lck_mtx_lock_spin(&ifp->if_ref_lock);
8426 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8427 if (ifp->if_suspend > 0) {
8428 lck_mtx_unlock(&ifp->if_ref_lock);
8429 return FALSE;
8430 }
8431 ifnet_datamov_suspend_locked(ifp);
8432 lck_mtx_unlock(&ifp->if_ref_lock);
8433 return TRUE;
8434 }
8435
/*
 * Wait until every in-flight data mover has left the interface.  Data
 * movement must already be suspended (if_suspend > 0, IFRF_READY clear).
 * Sleeps on if_datamov until the count drains to zero, then purges the
 * send queue of TXSTART interfaces.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		/* msleep drops and re-takes if_ref_lock */
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8463
/*
 * Convenience wrapper: suspend data movement, then wait for all movers to
 * drain.  Pair with ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8470
8471 void
ifnet_datamov_resume(struct ifnet * ifp)8472 ifnet_datamov_resume(struct ifnet *ifp)
8473 {
8474 lck_mtx_lock(&ifp->if_ref_lock);
8475 /* data movement must already be suspended */
8476 VERIFY(ifp->if_suspend > 0);
8477 if (--ifp->if_suspend == 0) {
8478 VERIFY(!(ifp->if_refflags & IFRF_READY));
8479 ifp->if_refflags |= IFRF_READY;
8480 }
8481 ifnet_decr_iorefcnt_locked(ifp);
8482 lck_mtx_unlock(&ifp->if_ref_lock);
8483 }
8484
8485 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8486 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8487 {
8488 struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8489 ctrace_t *tr;
8490 u_int32_t idx;
8491 u_int16_t *cnt;
8492
8493 if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8494 panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8495 /* NOTREACHED */
8496 }
8497
8498 if (refhold) {
8499 cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8500 tr = dl_if_dbg->dldbg_if_refhold;
8501 } else {
8502 cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8503 tr = dl_if_dbg->dldbg_if_refrele;
8504 }
8505
8506 idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8507 ctrace_record(&tr[idx]);
8508 }
8509
8510 errno_t
dlil_if_ref(struct ifnet * ifp)8511 dlil_if_ref(struct ifnet *ifp)
8512 {
8513 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8514
8515 if (dl_if == NULL) {
8516 return EINVAL;
8517 }
8518
8519 lck_mtx_lock_spin(&dl_if->dl_if_lock);
8520 ++dl_if->dl_if_refcnt;
8521 if (dl_if->dl_if_refcnt == 0) {
8522 panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
8523 /* NOTREACHED */
8524 }
8525 if (dl_if->dl_if_trace != NULL) {
8526 (*dl_if->dl_if_trace)(dl_if, TRUE);
8527 }
8528 lck_mtx_unlock(&dl_if->dl_if_lock);
8529
8530 return 0;
8531 }
8532
8533 errno_t
dlil_if_free(struct ifnet * ifp)8534 dlil_if_free(struct ifnet *ifp)
8535 {
8536 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8537 bool need_release = FALSE;
8538
8539 if (dl_if == NULL) {
8540 return EINVAL;
8541 }
8542
8543 lck_mtx_lock_spin(&dl_if->dl_if_lock);
8544 switch (dl_if->dl_if_refcnt) {
8545 case 0:
8546 panic("%s: negative refcnt for ifp=%p", __func__, ifp);
8547 /* NOTREACHED */
8548 break;
8549 case 1:
8550 if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8551 need_release = TRUE;
8552 }
8553 break;
8554 default:
8555 break;
8556 }
8557 --dl_if->dl_if_refcnt;
8558 if (dl_if->dl_if_trace != NULL) {
8559 (*dl_if->dl_if_trace)(dl_if, FALSE);
8560 }
8561 lck_mtx_unlock(&dl_if->dl_if_lock);
8562 if (need_release) {
8563 _dlil_if_release(ifp, true);
8564 }
8565 return 0;
8566 }
8567
/*
 * Attach a configured if_proto to its interface: let the family refine the
 * demux descriptors, insert the proto at the tail of the per-ifnet hash
 * chain, and post a KEV_DL_PROTO_ATTACHED event.  On success a proto refcnt
 * is held for the attachment.  Returns EINVAL for vmnet interfaces (except
 * PF_BRIDGE), ENXIO if the interface is detaching, EEXIST if the family is
 * already attached, or the family's add_proto error.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* take an io ref; dropped at ioref_done */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		/* family already attached; drop the lookup refcnt and bail */
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* walk to the tail of the chain so attach order is preserved */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8647
/*
 * Post-attach housekeeping shared by the v1 and v2 attach paths: bring
 * the interface administratively up and broadcast the flag change.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8671
8672 errno_t
ifnet_attach_protocol(ifnet_t ifp,protocol_family_t protocol,const struct ifnet_attach_proto_param * proto_details)8673 ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
8674 const struct ifnet_attach_proto_param *proto_details)
8675 {
8676 int retval = 0;
8677 struct if_proto *ifproto = NULL;
8678 uint32_t proto_count = 0;
8679
8680 ifnet_head_lock_shared();
8681 if (ifp == NULL || protocol == 0 || proto_details == NULL) {
8682 retval = EINVAL;
8683 goto end;
8684 }
8685 /* Check that the interface is in the global list */
8686 if (!ifnet_lookup(ifp)) {
8687 retval = ENXIO;
8688 goto end;
8689 }
8690
8691 ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8692
8693 /* refcnt held above during lookup */
8694 ifproto->ifp = ifp;
8695 ifproto->protocol_family = protocol;
8696 ifproto->proto_kpi = kProtoKPI_v1;
8697 ifproto->kpi.v1.input = proto_details->input;
8698 ifproto->kpi.v1.pre_output = proto_details->pre_output;
8699 ifproto->kpi.v1.event = proto_details->event;
8700 ifproto->kpi.v1.ioctl = proto_details->ioctl;
8701 ifproto->kpi.v1.detached = proto_details->detached;
8702 ifproto->kpi.v1.resolve_multi = proto_details->resolve;
8703 ifproto->kpi.v1.send_arp = proto_details->send_arp;
8704
8705 retval = dlil_attach_protocol(ifproto,
8706 proto_details->demux_list, proto_details->demux_count,
8707 &proto_count);
8708
8709 end:
8710 if (retval == EEXIST) {
8711 /* already attached */
8712 if (dlil_verbose) {
8713 DLIL_PRINTF("%s: protocol %d already attached\n",
8714 ifp != NULL ? if_name(ifp) : "N/A",
8715 protocol);
8716 }
8717 } else if (retval != 0) {
8718 DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
8719 ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
8720 } else if (dlil_verbose) {
8721 DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
8722 ifp != NULL ? if_name(ifp) : "N/A",
8723 protocol, proto_count);
8724 }
8725 ifnet_head_done();
8726 if (retval == 0) {
8727 dlil_handle_proto_attach(ifp, protocol);
8728 } else if (ifproto != NULL) {
8729 zfree(dlif_proto_zone, ifproto);
8730 }
8731 return retval;
8732 }
8733
/*
 * Attach a version-2 protocol handler to an interface.
 *
 * Allocates an if_proto from dlif_proto_zone, copies the v2 callbacks
 * from proto_details into it, and hands it to dlil_attach_protocol()
 * together with the caller-supplied demux descriptors.  On failure the
 * if_proto allocated here is freed; on success dlil_handle_proto_attach()
 * is invoked after the ifnet head lock has been dropped.
 *
 * Returns 0 on success, EINVAL on bad arguments, ENXIO if the interface
 * is not in the global interface list, or EEXIST (from
 * dlil_attach_protocol()) if the protocol is already attached.
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	/* head lock held shared across the lookup and the attach below */
	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed: release the zone element allocated above */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8795
/*
 * Detach a protocol handler from an interface.
 *
 * Looks up the attached if_proto (the lookup takes a reference),
 * notifies the family module via if_del_proto, unlinks the entry from
 * the protocol hash, and repoints the KPI callbacks at the
 * ifproto_media_* placeholder stubs so that any concurrent caller that
 * still holds a reference invokes a harmless stub (which returns ENXIO)
 * rather than the detached protocol's functions.  The remaining
 * teardown runs when the last if_proto reference is released.
 *
 * Returns 0 on success, EINVAL on bad arguments, or ENXIO if the
 * protocol is not attached to the interface.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* swap in inert placeholder callbacks; see header comment */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8861
8862 static errno_t
ifproto_media_input_v1(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet,char * header)8863 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8864 struct mbuf *packet, char *header)
8865 {
8866 #pragma unused(ifp, protocol, packet, header)
8867 return ENXIO;
8868 }
8869
8870 static errno_t
ifproto_media_input_v2(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet)8871 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8872 struct mbuf *packet)
8873 {
8874 #pragma unused(ifp, protocol, packet)
8875 return ENXIO;
8876 }
8877
8878 static errno_t
ifproto_media_preout(struct ifnet * ifp,protocol_family_t protocol,mbuf_t * packet,const struct sockaddr * dest,void * route,char * frame_type,char * link_layer_dest)8879 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8880 mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8881 char *link_layer_dest)
8882 {
8883 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8884 return ENXIO;
8885 }
8886
8887 static void
ifproto_media_event(struct ifnet * ifp,protocol_family_t protocol,const struct kev_msg * event)8888 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8889 const struct kev_msg *event)
8890 {
8891 #pragma unused(ifp, protocol, event)
8892 }
8893
8894 static errno_t
ifproto_media_ioctl(struct ifnet * ifp,protocol_family_t protocol,unsigned long command,void * argument)8895 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8896 unsigned long command, void *argument)
8897 {
8898 #pragma unused(ifp, protocol, command, argument)
8899 return ENXIO;
8900 }
8901
8902 static errno_t
ifproto_media_resolve_multi(ifnet_t ifp,const struct sockaddr * proto_addr,struct sockaddr_dl * out_ll,size_t ll_len)8903 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8904 struct sockaddr_dl *out_ll, size_t ll_len)
8905 {
8906 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8907 return ENXIO;
8908 }
8909
8910 static errno_t
ifproto_media_send_arp(struct ifnet * ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8911 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8912 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8913 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8914 {
8915 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8916 return ENXIO;
8917 }
8918
8919 extern int if_next_index(void);
8920 extern int tcp_ecn_outbound;
8921
8922 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8923 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8924 {
8925 uint32_t sflags = 0;
8926 int err;
8927
8928 if (if_flowadv) {
8929 sflags |= PKTSCHEDF_QALG_FLOWCTL;
8930 }
8931
8932 if (if_delaybased_queue) {
8933 sflags |= PKTSCHEDF_QALG_DELAYBASED;
8934 }
8935
8936 if (ifp->if_output_sched_model ==
8937 IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8938 sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8939 }
8940 /* Inherit drop limit from the default queue */
8941 if (ifp->if_snd != ifcq) {
8942 IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8943 }
8944 /* Initialize transmit queue(s) */
8945 err = ifclassq_setup(ifcq, ifp, sflags);
8946 if (err != 0) {
8947 panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8948 "err=%d", __func__, ifp, err);
8949 /* NOTREACHED */
8950 }
8951 }
8952
/*
 * Attach an ifnet to the system, making it visible for lookups.
 *
 * ll_addr, if non-NULL, supplies the link-layer address; its sdl_alen
 * must match if_addrlen (or if_addrlen must be 0, in which case it is
 * taken from ll_addr).  The routine assigns an if_index if needed,
 * installs the link address, resets statistics and per-interface state,
 * sets up the transmit classq, creates the DLIL input / starter /
 * poller kernel threads as applicable, attaches IGMP/MLD state, waits
 * for all created threads to be scheduled at least once, and finally
 * marks the ifnet IFRF_ATTACHED | IFRF_READY.
 *
 * Returns 0 on success; EINVAL on NULL ifp or address-length mismatch;
 * EEXIST if already attached; ENODEV if the family module callbacks
 * are missing; ENOBUFS if no if_index slot or link address storage is
 * available.
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	/* adopt or validate the link-layer address length */
	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* a reused (recycled) ifnet keeps its multicast list */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	if (ifp->if_index == 0) {
		int idx = if_next_index();

		/*
		 * Since we exhausted the list of
		 * if_index's, try to find an empty slot
		 * in ifindex2ifnet.
		 */
		if (idx == -1 && if_index >= UINT16_MAX) {
			for (int i = 1; i < if_index; i++) {
				if (ifindex2ifnet[i] == NULL &&
				    ifnet_addrs[i - 1] == NULL) {
					idx = i;
					break;
				}
			}
		}
		if (idx == -1) {
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	IFA_ADDREF_LOCKED(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* publish the ifnet; it is now visible to lookups */
	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		if (err == ENODEV) {
			/* ENODEV is tolerated: no dedicated input thread */
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_pacemaker_time = 0;
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* bump the starter thread's scheduling precedence */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* bump the poller thread's scheduling precedence */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/* count suspended link-layer memberships carried over via DLIF_REUSE */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that. This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
9455
9456 /*
 * Prepare the storage for the first/permanent link address, which must
 * have the same lifetime as the ifnet itself. Although the link
9459 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9460 * its location in memory must never change as it may still be referred
9461 * to by some parts of the system afterwards (unfortunate implementation
9462 * artifacts inherited from BSD.)
9463 *
9464 * Caller must hold ifnet lock as writer.
9465 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Compute the sockaddr_dl sizes: the mask covers the name portion
	 * only; the address additionally covers if_addrlen bytes, rounded
	 * up to a 32-bit boundary and no smaller than sockaddr_dl itself.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			/* permanent allocation; never freed (see header) */
			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure. This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	/* populate the address sockaddr_dl: name, index, type, lladdr */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* the netmask is all-ones across the name portion */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* drop the reference on any previously installed link address */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9574
/*
 * Purge all IPv4 (when INET is configured) and IPv6 addresses
 * from the interface.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9583
9584 errno_t
ifnet_detach(ifnet_t ifp)9585 ifnet_detach(ifnet_t ifp)
9586 {
9587 struct ifnet *delegated_ifp;
9588 struct nd_ifinfo *ndi = NULL;
9589
9590 if (ifp == NULL) {
9591 return EINVAL;
9592 }
9593
9594 ndi = ND_IFINFO(ifp);
9595 if (NULL != ndi) {
9596 ndi->cga_initialized = FALSE;
9597 }
9598
9599 /* Mark the interface down */
9600 if_down(ifp);
9601
9602 /*
9603 * IMPORTANT NOTE
9604 *
9605 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
9606 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
9607 * until after we've waited for all I/O references to drain
9608 * in ifnet_detach_final().
9609 */
9610
9611 ifnet_head_lock_exclusive();
9612 ifnet_lock_exclusive(ifp);
9613
9614 if (ifp->if_output_netem != NULL) {
9615 netem_destroy(ifp->if_output_netem);
9616 ifp->if_output_netem = NULL;
9617 }
9618
9619 /*
9620 * Check to see if this interface has previously triggered
9621 * aggressive protocol draining; if so, decrement the global
9622 * refcnt and clear PR_AGGDRAIN on the route domain if
9623 * there are no more of such an interface around.
9624 */
9625 (void) ifnet_set_idle_flags_locked(ifp, 0, ~0);
9626
9627 lck_mtx_lock_spin(&ifp->if_ref_lock);
9628 if (!(ifp->if_refflags & IFRF_ATTACHED)) {
9629 lck_mtx_unlock(&ifp->if_ref_lock);
9630 ifnet_lock_done(ifp);
9631 ifnet_head_done();
9632 return EINVAL;
9633 } else if (ifp->if_refflags & IFRF_DETACHING) {
9634 /* Interface has already been detached */
9635 lck_mtx_unlock(&ifp->if_ref_lock);
9636 ifnet_lock_done(ifp);
9637 ifnet_head_done();
9638 return ENXIO;
9639 }
9640 VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
9641 /* Indicate this interface is being detached */
9642 ifp->if_refflags &= ~IFRF_ATTACHED;
9643 ifp->if_refflags |= IFRF_DETACHING;
9644 lck_mtx_unlock(&ifp->if_ref_lock);
9645
9646 if (dlil_verbose) {
9647 DLIL_PRINTF("%s: detaching\n", if_name(ifp));
9648 }
9649
9650 /* clean up flow control entry object if there's any */
9651 if (ifp->if_eflags & IFEF_TXSTART) {
9652 ifnet_flowadv(ifp->if_flowhash);
9653 }
9654
9655 /* Reset ECN enable/disable flags */
9656 /* Reset CLAT46 flag */
9657 if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);
9658
9659 /*
9660 * We do not reset the TCP keep alive counters in case
9661 * a TCP connection stays connection after the interface
9662 * went down
9663 */
9664 if (ifp->if_tcp_kao_cnt > 0) {
9665 os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
9666 __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
9667 }
9668 ifp->if_tcp_kao_max = 0;
9669
9670 /*
9671 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
9672 * no longer be visible during lookups from this point.
9673 */
9674 VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
9675 TAILQ_REMOVE(&ifnet_head, ifp, if_link);
9676 ifp->if_link.tqe_next = NULL;
9677 ifp->if_link.tqe_prev = NULL;
9678 if (ifp->if_ordered_link.tqe_next != NULL ||
9679 ifp->if_ordered_link.tqe_prev != NULL) {
9680 ifnet_remove_from_ordered_list(ifp);
9681 }
9682 ifindex2ifnet[ifp->if_index] = NULL;
9683
9684 /* 18717626 - reset router mode */
9685 if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
9686 ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;
9687
9688 /* Record detach PC stacktrace */
9689 ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);
9690
9691 /* Clear logging parameters */
9692 bzero(&ifp->if_log, sizeof(ifp->if_log));
9693
9694 /* Clear delegated interface info (reference released below) */
9695 delegated_ifp = ifp->if_delegated.ifp;
9696 bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));
9697
9698 /* Reset interface state */
9699 bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
9700
9701 /*
9702 * Increment the generation count on interface deletion
9703 */
9704 ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);
9705
9706 ifnet_lock_done(ifp);
9707 ifnet_head_done();
9708
9709 /* Release reference held on the delegated interface */
9710 if (delegated_ifp != NULL) {
9711 ifnet_release(delegated_ifp);
9712 }
9713
9714 /* Reset Link Quality Metric (unless loopback [lo0]) */
9715 if (ifp != lo_ifp) {
9716 if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
9717 }
9718
9719 /* Reset TCP local statistics */
9720 if (ifp->if_tcp_stat != NULL) {
9721 bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
9722 }
9723
9724 /* Reset UDP local statistics */
9725 if (ifp->if_udp_stat != NULL) {
9726 bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
9727 }
9728
9729 /* Reset ifnet IPv4 stats */
9730 if (ifp->if_ipv4_stat != NULL) {
9731 bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
9732 }
9733
9734 /* Reset ifnet IPv6 stats */
9735 if (ifp->if_ipv6_stat != NULL) {
9736 bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
9737 }
9738
9739 /* Release memory held for interface link status report */
9740 if (ifp->if_link_status != NULL) {
9741 kfree_type(struct if_link_status, ifp->if_link_status);
9742 ifp->if_link_status = NULL;
9743 }
9744
9745 /* Disable forwarding cached route */
9746 lck_mtx_lock(&ifp->if_cached_route_lock);
9747 ifp->if_fwd_cacheok = 0;
9748 lck_mtx_unlock(&ifp->if_cached_route_lock);
9749
9750 /* Disable data threshold and wait for any pending event posting */
9751 ifp->if_data_threshold = 0;
9752 VERIFY(ifp->if_dt_tcall != NULL);
9753 (void) thread_call_cancel_wait(ifp->if_dt_tcall);
9754
9755 /*
9756 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
9757 * references to the info structures and leave them attached to
9758 * this ifnet.
9759 */
9760 #if INET
9761 igmp_domifdetach(ifp);
9762 #endif /* INET */
9763 mld_domifdetach(ifp);
9764
9765 #if SKYWALK
9766 /* Clean up any netns tokens still pointing to to this ifnet */
9767 netns_ifnet_detach(ifp);
9768 #endif /* SKYWALK */
9769 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);
9770
9771 /* Let worker thread take care of the rest, to avoid reentrancy */
9772 dlil_if_lock();
9773 ifnet_detaching_enqueue(ifp);
9774 dlil_if_unlock();
9775
9776 return 0;
9777 }
9778
/*
 * Queue an ifnet that has begun detaching for the detacher worker
 * thread, which performs the final teardown (ifnet_detach_final) to
 * avoid reentrancy in the caller's context.  Caller must hold the
 * dlil interface lock.
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);	/* catch counter wraparound */
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	/* rouse the detacher thread blocked on ifnet_delayed_run */
	wakeup((caddr_t)&ifnet_delayed_run);
}
9789
9790 static struct ifnet *
ifnet_detaching_dequeue(void)9791 ifnet_detaching_dequeue(void)
9792 {
9793 struct ifnet *ifp;
9794
9795 dlil_if_lock_assert();
9796
9797 ifp = TAILQ_FIRST(&ifnet_detaching_head);
9798 VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9799 if (ifp != NULL) {
9800 VERIFY(ifnet_detaching_cnt != 0);
9801 --ifnet_detaching_cnt;
9802 TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9803 ifp->if_detaching_link.tqe_next = NULL;
9804 ifp->if_detaching_link.tqe_prev = NULL;
9805 }
9806 return ifp;
9807 }
9808
/*
 * Continuation body of the interface detacher thread.  Drains the
 * ifnet_detaching_head queue, calling ifnet_detach_final() on each
 * dequeued interface with the dlil lock dropped around the call, then
 * re-arms the wait on ifnet_delayed_run and blocks with itself as the
 * continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		/* first run after thread creation; see ifnet_detacher_thread_func */
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		/* queue drained; fall through to block again */
		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the dlil lock across the blocking teardown */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* re-arm the wait before dropping the lock to avoid missed wakeups */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0); /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9851
/*
 * Entry point of the dedicated interface detacher thread.  Arms the
 * wait on ifnet_delayed_run, marks the thread embryonic (cleared on
 * the continuation's first run, which also decrements the pending
 * thread count), and transfers control to ifnet_detacher_thread_cont.
 * Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9868
9869 static void
ifnet_detach_final(struct ifnet * ifp)9870 ifnet_detach_final(struct ifnet *ifp)
9871 {
9872 struct ifnet_filter *filter, *filter_next;
9873 struct dlil_ifnet *dlifp;
9874 struct ifnet_filter_head fhead;
9875 struct dlil_threading_info *inp;
9876 struct ifaddr *ifa;
9877 ifnet_detached_func if_free;
9878 int i;
9879
9880 /* Let BPF know we're detaching */
9881 bpfdetach(ifp);
9882
9883 #if SKYWALK
9884 dlil_netif_detach_notify(ifp);
9885 /*
9886 * Wait for the datapath to quiesce before tearing down
9887 * netif/flowswitch nexuses.
9888 */
9889 dlil_quiesce_and_detach_nexuses(ifp);
9890 #endif /* SKYWALK */
9891
9892 lck_mtx_lock(&ifp->if_ref_lock);
9893 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9894 panic("%s: flags mismatch (detaching not set) ifp=%p",
9895 __func__, ifp);
9896 /* NOTREACHED */
9897 }
9898
9899 /*
9900 * Wait until the existing IO references get released
9901 * before we proceed with ifnet_detach. This is not a
9902 * common case, so block without using a continuation.
9903 */
9904 while (ifp->if_refio > 0) {
9905 DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9906 "to be released\n", __func__, if_name(ifp));
9907 (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9908 (PZERO - 1), "ifnet_ioref_wait", NULL);
9909 }
9910
9911 VERIFY(ifp->if_datamov == 0);
9912 VERIFY(ifp->if_drainers == 0);
9913 VERIFY(ifp->if_suspend == 0);
9914 ifp->if_refflags &= ~IFRF_READY;
9915 lck_mtx_unlock(&ifp->if_ref_lock);
9916
9917 /* Clear agent IDs */
9918 if (ifp->if_agentids != NULL) {
9919 kfree_data(ifp->if_agentids,
9920 sizeof(uuid_t) * ifp->if_agentcount);
9921 ifp->if_agentids = NULL;
9922 }
9923 ifp->if_agentcount = 0;
9924
9925 #if SKYWALK
9926 VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9927 #endif /* SKYWALK */
9928 /* Drain and destroy send queue */
9929 ifclassq_teardown(ifp->if_snd);
9930
9931 /* Detach interface filters */
9932 lck_mtx_lock(&ifp->if_flt_lock);
9933 if_flt_monitor_enter(ifp);
9934
9935 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9936 fhead = ifp->if_flt_head;
9937 TAILQ_INIT(&ifp->if_flt_head);
9938
9939 for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9940 filter_next = TAILQ_NEXT(filter, filt_next);
9941 lck_mtx_unlock(&ifp->if_flt_lock);
9942
9943 dlil_detach_filter_internal(filter, 1);
9944 lck_mtx_lock(&ifp->if_flt_lock);
9945 }
9946 if_flt_monitor_leave(ifp);
9947 lck_mtx_unlock(&ifp->if_flt_lock);
9948
9949 /* Tell upper layers to drop their network addresses */
9950 if_purgeaddrs(ifp);
9951
9952 ifnet_lock_exclusive(ifp);
9953
9954 /* Unplumb all protocols */
9955 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9956 struct if_proto *proto;
9957
9958 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9959 while (proto != NULL) {
9960 protocol_family_t family = proto->protocol_family;
9961 ifnet_lock_done(ifp);
9962 proto_unplumb(family, ifp);
9963 ifnet_lock_exclusive(ifp);
9964 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9965 }
9966 /* There should not be any protocols left */
9967 VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9968 }
9969 kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9970 ifp->if_proto_hash = NULL;
9971
9972 /* Detach (permanent) link address from if_addrhead */
9973 ifa = TAILQ_FIRST(&ifp->if_addrhead);
9974 VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9975 IFA_LOCK(ifa);
9976 if_detach_link_ifa(ifp, ifa);
9977 IFA_UNLOCK(ifa);
9978
9979 /* Remove (permanent) link address from ifnet_addrs[] */
9980 IFA_REMREF(ifa);
9981 ifnet_addrs[ifp->if_index - 1] = NULL;
9982
9983 /* This interface should not be on {ifnet_head,detaching} */
9984 VERIFY(ifp->if_link.tqe_next == NULL);
9985 VERIFY(ifp->if_link.tqe_prev == NULL);
9986 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9987 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9988 VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9989 VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9990
9991 /* The slot should have been emptied */
9992 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9993
9994 /* There should not be any addresses left */
9995 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9996
9997 /*
9998 * Signal the starter thread to terminate itself, and wait until
9999 * it has exited.
10000 */
10001 if (ifp->if_start_thread != THREAD_NULL) {
10002 lck_mtx_lock_spin(&ifp->if_start_lock);
10003 ifp->if_start_flags |= IFSF_TERMINATING;
10004 wakeup_one((caddr_t)&ifp->if_start_thread);
10005 lck_mtx_unlock(&ifp->if_start_lock);
10006
10007 /* wait for starter thread to terminate */
10008 lck_mtx_lock(&ifp->if_start_lock);
10009 while (ifp->if_start_thread != THREAD_NULL) {
10010 if (dlil_verbose) {
10011 DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
10012 __func__,
10013 if_name(ifp));
10014 }
10015 (void) msleep(&ifp->if_start_thread,
10016 &ifp->if_start_lock, (PZERO - 1),
10017 "ifnet_start_thread_exit", NULL);
10018 }
10019 lck_mtx_unlock(&ifp->if_start_lock);
10020 if (dlil_verbose) {
10021 DLIL_PRINTF("%s: %s starter thread termination complete",
10022 __func__, if_name(ifp));
10023 }
10024 }
10025
10026 /*
10027 * Signal the poller thread to terminate itself, and wait until
10028 * it has exited.
10029 */
10030 if (ifp->if_poll_thread != THREAD_NULL) {
10031 #if SKYWALK
10032 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
10033 #endif /* SKYWALK */
10034 lck_mtx_lock_spin(&ifp->if_poll_lock);
10035 ifp->if_poll_flags |= IF_POLLF_TERMINATING;
10036 wakeup_one((caddr_t)&ifp->if_poll_thread);
10037 lck_mtx_unlock(&ifp->if_poll_lock);
10038
10039 /* wait for poller thread to terminate */
10040 lck_mtx_lock(&ifp->if_poll_lock);
10041 while (ifp->if_poll_thread != THREAD_NULL) {
10042 if (dlil_verbose) {
10043 DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
10044 __func__,
10045 if_name(ifp));
10046 }
10047 (void) msleep(&ifp->if_poll_thread,
10048 &ifp->if_poll_lock, (PZERO - 1),
10049 "ifnet_poll_thread_exit", NULL);
10050 }
10051 lck_mtx_unlock(&ifp->if_poll_lock);
10052 if (dlil_verbose) {
10053 DLIL_PRINTF("%s: %s poller thread termination complete\n",
10054 __func__, if_name(ifp));
10055 }
10056 }
10057
10058 /*
10059 * If thread affinity was set for the workloop thread, we will need
10060 * to tear down the affinity and release the extra reference count
10061 * taken at attach time. Does not apply to lo0 or other interfaces
10062 * without dedicated input threads.
10063 */
10064 if ((inp = ifp->if_inp) != NULL) {
10065 VERIFY(inp != dlil_main_input_thread);
10066
10067 if (inp->dlth_affinity) {
10068 struct thread *tp, *wtp, *ptp;
10069
10070 lck_mtx_lock_spin(&inp->dlth_lock);
10071 wtp = inp->dlth_driver_thread;
10072 inp->dlth_driver_thread = THREAD_NULL;
10073 ptp = inp->dlth_poller_thread;
10074 inp->dlth_poller_thread = THREAD_NULL;
10075 ASSERT(inp->dlth_thread != THREAD_NULL);
10076 tp = inp->dlth_thread; /* don't nullify now */
10077 inp->dlth_affinity_tag = 0;
10078 inp->dlth_affinity = FALSE;
10079 lck_mtx_unlock(&inp->dlth_lock);
10080
10081 /* Tear down poll thread affinity */
10082 if (ptp != NULL) {
10083 VERIFY(ifp->if_eflags & IFEF_RXPOLL);
10084 VERIFY(ifp->if_xflags & IFXF_LEGACY);
10085 (void) dlil_affinity_set(ptp,
10086 THREAD_AFFINITY_TAG_NULL);
10087 thread_deallocate(ptp);
10088 }
10089
10090 /* Tear down workloop thread affinity */
10091 if (wtp != NULL) {
10092 (void) dlil_affinity_set(wtp,
10093 THREAD_AFFINITY_TAG_NULL);
10094 thread_deallocate(wtp);
10095 }
10096
10097 /* Tear down DLIL input thread affinity */
10098 (void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
10099 thread_deallocate(tp);
10100 }
10101
10102 /* disassociate ifp DLIL input thread */
10103 ifp->if_inp = NULL;
10104
10105 /* if the worker thread was created, tell it to terminate */
10106 if (inp->dlth_thread != THREAD_NULL) {
10107 lck_mtx_lock_spin(&inp->dlth_lock);
10108 inp->dlth_flags |= DLIL_INPUT_TERMINATE;
10109 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
10110 wakeup_one((caddr_t)&inp->dlth_flags);
10111 }
10112 lck_mtx_unlock(&inp->dlth_lock);
10113 ifnet_lock_done(ifp);
10114
10115 /* wait for the input thread to terminate */
10116 lck_mtx_lock_spin(&inp->dlth_lock);
10117 while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
10118 == 0) {
10119 (void) msleep(&inp->dlth_flags, &inp->dlth_lock,
10120 (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
10121 }
10122 lck_mtx_unlock(&inp->dlth_lock);
10123 ifnet_lock_exclusive(ifp);
10124 }
10125
10126 /* clean-up input thread state */
10127 dlil_clean_threading_info(inp);
10128 /* clean-up poll parameters */
10129 VERIFY(ifp->if_poll_thread == THREAD_NULL);
10130 dlil_reset_rxpoll_params(ifp);
10131 }
10132
10133 /* The driver might unload, so point these to ourselves */
10134 if_free = ifp->if_free;
10135 ifp->if_output_dlil = ifp_if_output;
10136 ifp->if_output = ifp_if_output;
10137 ifp->if_pre_enqueue = ifp_if_output;
10138 ifp->if_start = ifp_if_start;
10139 ifp->if_output_ctl = ifp_if_ctl;
10140 ifp->if_input_dlil = ifp_if_input;
10141 ifp->if_input_poll = ifp_if_input_poll;
10142 ifp->if_input_ctl = ifp_if_ctl;
10143 ifp->if_ioctl = ifp_if_ioctl;
10144 ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
10145 ifp->if_free = ifp_if_free;
10146 ifp->if_demux = ifp_if_demux;
10147 ifp->if_event = ifp_if_event;
10148 ifp->if_framer_legacy = ifp_if_framer;
10149 ifp->if_framer = ifp_if_framer_extended;
10150 ifp->if_add_proto = ifp_if_add_proto;
10151 ifp->if_del_proto = ifp_if_del_proto;
10152 ifp->if_check_multi = ifp_if_check_multi;
10153
10154 /* wipe out interface description */
10155 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
10156 ifp->if_desc.ifd_len = 0;
10157 VERIFY(ifp->if_desc.ifd_desc != NULL);
10158 bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
10159
10160 /* there shouldn't be any delegation by now */
10161 VERIFY(ifp->if_delegated.ifp == NULL);
10162 VERIFY(ifp->if_delegated.type == 0);
10163 VERIFY(ifp->if_delegated.family == 0);
10164 VERIFY(ifp->if_delegated.subfamily == 0);
10165 VERIFY(ifp->if_delegated.expensive == 0);
10166 VERIFY(ifp->if_delegated.constrained == 0);
10167
10168 /* QoS marking get cleared */
10169 if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
10170 if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
10171
10172 #if SKYWALK
10173 /* the nexus destructor is responsible for clearing these */
10174 VERIFY(ifp->if_na_ops == NULL);
10175 VERIFY(ifp->if_na == NULL);
10176 #endif /* SKYWALK */
10177
10178 /* promiscuous/allmulti counts need to start at zero again */
10179 ifp->if_pcount = 0;
10180 ifp->if_amcount = 0;
10181 ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
10182
10183 ifnet_lock_done(ifp);
10184
10185 #if PF
10186 /*
10187 * Detach this interface from packet filter, if enabled.
10188 */
10189 pf_ifnet_hook(ifp, 0);
10190 #endif /* PF */
10191
10192 /* Filter list should be empty */
10193 lck_mtx_lock_spin(&ifp->if_flt_lock);
10194 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
10195 VERIFY(ifp->if_flt_busy == 0);
10196 VERIFY(ifp->if_flt_waiters == 0);
10197 VERIFY(ifp->if_flt_non_os_count == 0);
10198 VERIFY(ifp->if_flt_no_tso_count == 0);
10199 lck_mtx_unlock(&ifp->if_flt_lock);
10200
10201 /* Last chance to drain send queue */
10202 if_qflush_snd(ifp, 0);
10203
10204 /* Last chance to cleanup any cached route */
10205 lck_mtx_lock(&ifp->if_cached_route_lock);
10206 VERIFY(!ifp->if_fwd_cacheok);
10207 ROUTE_RELEASE(&ifp->if_fwd_route);
10208 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
10209 ROUTE_RELEASE(&ifp->if_src_route);
10210 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
10211 ROUTE_RELEASE(&ifp->if_src_route6);
10212 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
10213 lck_mtx_unlock(&ifp->if_cached_route_lock);
10214
10215 VERIFY(ifp->if_data_threshold == 0);
10216 VERIFY(ifp->if_dt_tcall != NULL);
10217 VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
10218
10219 ifnet_llreach_ifdetach(ifp);
10220
10221 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
10222
10223 /*
10224 * Finally, mark this ifnet as detached.
10225 */
10226 if (dlil_verbose) {
10227 DLIL_PRINTF("%s: detached\n", if_name(ifp));
10228 }
10229 lck_mtx_lock_spin(&ifp->if_ref_lock);
10230 if (!(ifp->if_refflags & IFRF_DETACHING)) {
10231 panic("%s: flags mismatch (detaching not set) ifp=%p",
10232 __func__, ifp);
10233 /* NOTREACHED */
10234 }
10235 ifp->if_refflags &= ~IFRF_DETACHING;
10236 lck_mtx_unlock(&ifp->if_ref_lock);
10237 if (if_free != NULL) {
10238 if_free(ifp);
10239 }
10240
10241 ifclassq_release(&ifp->if_snd);
10242
10243 /* we're fully detached, clear the "in use" bit */
10244 dlifp = (struct dlil_ifnet *)ifp;
10245 lck_mtx_lock(&dlifp->dl_if_lock);
10246 ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
10247 dlifp->dl_if_flags &= ~DLIF_INUSE;
10248 lck_mtx_unlock(&dlifp->dl_if_lock);
10249
10250 /* Release reference held during ifnet attach */
10251 ifnet_release(ifp);
10252 }
10253
10254 errno_t
ifp_if_output(struct ifnet * ifp,struct mbuf * m)10255 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
10256 {
10257 #pragma unused(ifp)
10258 m_freem_list(m);
10259 return 0;
10260 }
10261
/*
 * Stub start handler installed while an interface detaches; purges
 * anything still queued on the interface.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
10267
10268 static errno_t
ifp_if_input(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)10269 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
10270 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
10271 boolean_t poll, struct thread *tp)
10272 {
10273 #pragma unused(ifp, m_tail, s, poll, tp)
10274 m_freem_list(m_head);
10275 return ENXIO;
10276 }
10277
10278 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)10279 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
10280 struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
10281 {
10282 #pragma unused(ifp, flags, max_cnt)
10283 if (m_head != NULL) {
10284 *m_head = NULL;
10285 }
10286 if (m_tail != NULL) {
10287 *m_tail = NULL;
10288 }
10289 if (cnt != NULL) {
10290 *cnt = 0;
10291 }
10292 if (len != NULL) {
10293 *len = 0;
10294 }
10295 }
10296
10297 static errno_t
ifp_if_ctl(struct ifnet * ifp,ifnet_ctl_cmd_t cmd,u_int32_t arglen,void * arg)10298 ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
10299 {
10300 #pragma unused(ifp, cmd, arglen, arg)
10301 return EOPNOTSUPP;
10302 }
10303
10304 static errno_t
ifp_if_demux(struct ifnet * ifp,struct mbuf * m,char * fh,protocol_family_t * pf)10305 ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
10306 {
10307 #pragma unused(ifp, fh, pf)
10308 m_freem(m);
10309 return EJUSTRETURN;
10310 }
10311
10312 static errno_t
ifp_if_add_proto(struct ifnet * ifp,protocol_family_t pf,const struct ifnet_demux_desc * da,u_int32_t dc)10313 ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
10314 const struct ifnet_demux_desc *da, u_int32_t dc)
10315 {
10316 #pragma unused(ifp, pf, da, dc)
10317 return EINVAL;
10318 }
10319
10320 static errno_t
ifp_if_del_proto(struct ifnet * ifp,protocol_family_t pf)10321 ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
10322 {
10323 #pragma unused(ifp, pf)
10324 return EINVAL;
10325 }
10326
10327 static errno_t
ifp_if_check_multi(struct ifnet * ifp,const struct sockaddr * sa)10328 ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
10329 {
10330 #pragma unused(ifp, sa)
10331 return EOPNOTSUPP;
10332 }
10333
/*
 * Stub legacy framer installed while an interface detaches; forwards
 * to ifp_if_framer_extended, which frees the packet and returns
 * EJUSTRETURN.  The non-macOS variant carries the extra pre/post
 * length parameters; on macOS the legacy signature lacks them, so
 * NULL is passed for both.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10352
10353 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10354 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10355 const struct sockaddr *sa, const char *ll, const char *t,
10356 u_int32_t *pre, u_int32_t *post)
10357 {
10358 #pragma unused(ifp, sa, ll, t)
10359 m_freem(*m);
10360 *m = NULL;
10361
10362 if (pre != NULL) {
10363 *pre = 0;
10364 }
10365 if (post != NULL) {
10366 *post = 0;
10367 }
10368
10369 return EJUSTRETURN;
10370 }
10371
10372 errno_t
ifp_if_ioctl(struct ifnet * ifp,unsigned long cmd,void * arg)10373 ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
10374 {
10375 #pragma unused(ifp, cmd, arg)
10376 return EOPNOTSUPP;
10377 }
10378
10379 static errno_t
ifp_if_set_bpf_tap(struct ifnet * ifp,bpf_tap_mode tm,bpf_packet_func f)10380 ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
10381 {
10382 #pragma unused(ifp, tm, f)
10383 /* XXX not sure what to do here */
10384 return 0;
10385 }
10386
/*
 * Stub destructor for a detaching interface: nothing to release.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
	(void)ifp;
}
10392
/*
 * Stub event handler for a detaching interface: events are ignored.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
	(void)ifp;
	(void)e;
}
10398
/*
 * Find or allocate backing storage for an interface of the given
 * family.  A detached dlil_ifnet with a matching unique id is
 * recycled (DLIF_REUSE); otherwise fresh, 64-bit-aligned storage is
 * carved out of dlif_zone.  On success *ifp points at the interface
 * and a dlil reference is held on it.
 *
 * Returns 0 on success, EBUSY if the extended name or unique id is
 * already held by an in-use interface, ENOMEM if the unique-id copy
 * cannot be allocated.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;	/* first recyclable match */
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		/* re-check under the entry lock; state may have changed */
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			/* free the zone element, not the aligned base */
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* name/xname point at the embedded storage */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;
	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10577
/*
 * Common body for returning an interface's dlil storage to the
 * available state: drop the net API allocation statistics, free any
 * out-of-line broadcast address buffer, point if_name/if_xname back
 * at the embedded storage (xname becomes "<name>?"), and optionally
 * clear the DLIF_INUSE flag.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* broadcast address larger than the inline buffer was heap-allocated */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10608
/*
 * Release an interface's dlil storage without clearing DLIF_INUSE;
 * that bit is cleared separately (ifnet_detach_final in this file
 * does so once detach fully completes).
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10614
/* Acquire the global dlil interface-list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10620
/* Release the global dlil interface-list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10626
/* Assert that the current thread owns the dlil interface-list mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10632
/*
 * Unplumb the standard protocol families (PF_INET, PF_INET6) from
 * the interface; other families are expected to unplumb themselves.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10648
/*
 * Copy the interface's cached IPv4 source route into *dst.  The
 * cached-route lock is taken spin and converted to a full mutex
 * before the copy is performed.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10659
/*
 * Install *src as the interface's cached IPv4 forwarding route.
 * Consumes the caller's reference: either route_copyin() takes it
 * over, or (when forwarding-cache use is disabled via
 * if_fwd_cacheok) the route is released outright.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10673
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the cached
 * route_in6 into *dst under the cached-route lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10685
/*
 * IPv6 counterpart of ifp_src_route_copyin(): install *src as the
 * cached route_in6, or release it when forwarding-cache use is
 * disabled.  Consumes the caller's route reference either way.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10700
/*
 * Look up (and cache) the route for src_ip, scoped to this interface.
 *
 * Fast path: if the per-ifnet cached route is still usable and matches
 * src_ip, return it.  Otherwise release the stale entry, perform a
 * scoped rtalloc1 lookup, stash the result back into the interface's
 * cache, and return it.
 *
 * Returns a route entry with a reference held for the caller, or NULL
 * if no route was found.  The caller is responsible for releasing it.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route src_rt;
	struct sockaddr_in *dst;

	/* View ro_dst through a sockaddr_in; void * cast avoids alignment warnings */
	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		/* (Re)initialize the destination sockaddr if it wasn't AF_INET */
		if (dst->sin_family != AF_INET) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		/* ROUTE_RELEASE above must have cleared the cached rtentry */
		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10735
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): return the cached
 * scoped route for *src_ip6, refreshing the per-ifnet cache on a miss.
 *
 * Returns a route entry with a reference held for the caller, or NULL
 * if no route was found.  The caller is responsible for releasing it.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		/* (Re)initialize the destination sockaddr if it wasn't AF_INET6 */
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/* ro_rt is NULL after ROUTE_RELEASE; look up a fresh route */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10772
/*
 * Update the interface's link quality metric (LQM) state and post a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED event if the state changed.
 *
 * lqm     - raw metric in [IFNET_LQM_MIN, IFNET_LQM_MAX]; normalized
 *           to one of the threshold edges below before being stored.
 * locked  - nonzero if the caller already holds the ifnet lock
 *           exclusively.
 *
 * Locking: the ifnet lock is ALWAYS released before posting the kernel
 * event, then reacquired only when the caller held it on entry, so the
 * lock state seen by the caller is preserved either way.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* Abort-level quality: kick the fast PCB timer so TCP can react */
		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return; /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10836
/*
 * Update the interface's RRC (radio resource control) state and post a
 * KEV_DL_RRC_STATE_CHANGED event if it changed.
 *
 * The caller must hold the ifnet lock exclusively (see if_state_update());
 * the lock is dropped while the kernel event is posted and reacquired
 * before returning, so the caller's lock state is preserved.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* No change and already valid: nothing to do */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10866
/*
 * Apply an externally-supplied interface state update (LQM, RRC state,
 * availability), validating each field flagged in valid_bitmask.
 *
 * Returns:
 *   ENOTSUP - RRC state supplied for a non-cellular interface;
 *   EINVAL  - LQM or RRC value out of range;
 *   0       - update applied.
 *
 * When the interface transitions to AVAILABLE, TCP connections over it
 * are prodded to send probes immediately (see comment near the end).
 */
errno_t
if_state_update(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	u_short if_index_available = 0;

	ifnet_lock_exclusive(ifp);

	/* RRC state only makes sense on cellular interfaces */
	if ((ifp->if_type != IFT_CELLULAR) &&
	    (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		ifnet_lock_done(ifp);
		return ENOTSUP;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}

	/* Both helpers below temporarily drop and reacquire the ifnet lock */
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		ifp->if_interface_state.interface_availability =
		    if_interface_state->interface_availability;

		if (ifp->if_interface_state.interface_availability ==
		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
			    __func__, if_name(ifp), ifp->if_index);
			if_index_available = ifp->if_index;
		} else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
			    __func__, if_name(ifp), ifp->if_index);
		}
	}
	ifnet_lock_done(ifp);

	/*
	 * Check if the TCP connections going on this interface should be
	 * forced to send probe packets instead of waiting for TCP timers
	 * to fire. This is done on an explicit notification such as
	 * SIOCSIFINTERFACESTATE which marks the interface as available.
	 */
	if (if_index_available > 0) {
		tcp_interface_send_probe(if_index_available);
	}

	return 0;
}
10937
/*
 * Snapshot the interface's state (RRC, LQM, availability) into the
 * caller-supplied structure.  Only fields whose bits are set in the
 * interface's valid_bitmask are copied; the output bitmask reflects
 * exactly what was copied.  Takes the ifnet lock shared.
 */
void
if_get_state(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	ifnet_lock_shared(ifp);

	if_interface_state->valid_bitmask = 0;

	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_RRC_STATE_VALID;
		if_interface_state->rrc_state =
		    ifp->if_interface_state.rrc_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
		if_interface_state->lqm_state =
		    ifp->if_interface_state.lqm_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		if_interface_state->interface_availability =
		    ifp->if_interface_state.interface_availability;
	}

	ifnet_lock_done(ifp);
}
10970
/*
 * Enable (conn_probe == 1) or disable (conn_probe == 0) connectivity
 * probing on the interface; any other value is EINVAL.  Updates the
 * IFEF_PROBE_CONNECTIVITY flag, notifies NECP clients, and tells TCP
 * to start/stop probing.
 */
errno_t
if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
{
	if (conn_probe > 1) {
		return EINVAL;
	}
	if (conn_probe == 0) {
		if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
	} else {
		if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
	}

#if NECP
	necp_update_all_clients();
#endif /* NECP */

	tcp_probe_connectivity(ifp, conn_probe);
	return 0;
}
10990
/* for uuid.c */
/*
 * Scan the interface list for an Ethernet interface to derive a UUID
 * node from.  Returns the if_index of en0 if present, else 0; in the
 * latter case *ret_other_index holds the lowest-unit en* interface,
 * or failing that any IFT_ETHER interface (0 if none).
 *
 * Caller must hold the ifnet head lock (the list is walked unlocked
 * here aside from per-ifnet shared locks).
 */
static int
get_ether_index(int * ret_other_index)
{
	struct ifnet *ifp;
	int en0_index = 0;
	int other_en_index = 0;
	int any_ether_index = 0;
	short best_unit = 0;

	*ret_other_index = 0;
	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
		/*
		 * find en0, or if not en0, the lowest unit en*, and if not
		 * that, any ethernet
		 */
		ifnet_lock_shared(ifp);
		if (strcmp(ifp->if_name, "en") == 0) {
			if (ifp->if_unit == 0) {
				/* found en0, we're done */
				en0_index = ifp->if_index;
				ifnet_lock_done(ifp);
				break;
			}
			if (other_en_index == 0 || ifp->if_unit < best_unit) {
				other_en_index = ifp->if_index;
				best_unit = ifp->if_unit;
			}
		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
			any_ether_index = ifp->if_index;
		}
		ifnet_lock_done(ifp);
	}
	if (en0_index == 0) {
		if (other_en_index != 0) {
			*ret_other_index = other_en_index;
		} else if (any_ether_index != 0) {
			*ret_other_index = any_ether_index;
		}
	}
	return en0_index;
}
11033
/*
 * Fill node[] with the 6-byte Ethernet address used as the UUID node
 * identifier.  Prefers en0, then the next-best Ethernet interface (see
 * get_ether_index()).  Uses the interface's permanent MAC address when
 * one was recorded, since it never changes across reconfiguration.
 *
 * Returns 0 on success, -1 when no Ethernet interface exists.
 * The cached en0_index is revalidated against ifindex2ifnet because
 * interface indices can be reused after detach.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
11075
/*
 * sysctl handler for if_rxpoll (input polling on/off).  New values are
 * rejected with ENXIO when polling support (net_rxpoll) is disabled.
 */
static int
sysctl_rxpoll SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_rxpoll;

	err = sysctl_handle_int(oidp, &i, 0, req);
	/* Error, or a read-only request: nothing to store */
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_rxpoll = i;
	return err;
}
11097
/*
 * sysctl handler for if_rxpoll_mode_holdtime; new values are clamped
 * up to IF_RXPOLL_MODE_HOLDTIME_MIN.
 */
static int
sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint64_t q;
	int err;

	q = if_rxpoll_mode_holdtime;

	err = sysctl_handle_quad(oidp, &q, 0, req);
	/* Error, or a read-only request: nothing to store */
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
		q = IF_RXPOLL_MODE_HOLDTIME_MIN;
	}

	if_rxpoll_mode_holdtime = q;

	return err;
}
11120
/*
 * sysctl handler for if_rxpoll_sample_holdtime; new values are clamped
 * up to IF_RXPOLL_SAMPLETIME_MIN.
 */
static int
sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint64_t q;
	int err;

	q = if_rxpoll_sample_holdtime;

	err = sysctl_handle_quad(oidp, &q, 0, req);
	/* Error, or a read-only request: nothing to store */
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (q < IF_RXPOLL_SAMPLETIME_MIN) {
		q = IF_RXPOLL_SAMPLETIME_MIN;
	}

	if_rxpoll_sample_holdtime = q;

	return err;
}
11143
/*
 * sysctl handler for if_rxpoll_interval_time; new values are clamped
 * up to IF_RXPOLL_INTERVALTIME_MIN.
 */
static int
sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint64_t q;
	int err;

	q = if_rxpoll_interval_time;

	err = sysctl_handle_quad(oidp, &q, 0, req);
	/* Error, or a read-only request: nothing to store */
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (q < IF_RXPOLL_INTERVALTIME_MIN) {
		q = IF_RXPOLL_INTERVALTIME_MIN;
	}

	if_rxpoll_interval_time = q;

	return err;
}
11166
/*
 * sysctl handler for the rxpoll low watermark.  The new value must be
 * nonzero and strictly below the high watermark, else EINVAL.
 */
static int
sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_sysctl_rxpoll_wlowat;

	err = sysctl_handle_int(oidp, &i, 0, req);
	/* Error, or a read-only request: nothing to store */
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
		return EINVAL;
	}

	if_sysctl_rxpoll_wlowat = i;
	return err;
}
11188
/*
 * sysctl handler for the rxpoll high watermark.  The new value must be
 * strictly above the low watermark, else EINVAL.
 */
static int
sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_sysctl_rxpoll_whiwat;

	err = sysctl_handle_int(oidp, &i, 0, req);
	/* Error, or a read-only request: nothing to store */
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (i <= if_sysctl_rxpoll_wlowat) {
		return EINVAL;
	}

	if_sysctl_rxpoll_whiwat = i;
	return err;
}
11210
/*
 * sysctl handler for the default send queue length; new values are
 * clamped up to IF_SNDQ_MINLEN.
 */
static int
sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int i, err;

	i = if_sndq_maxlen;

	err = sysctl_handle_int(oidp, &i, 0, req);
	/* Error, or a read-only request: nothing to store */
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (i < IF_SNDQ_MINLEN) {
		i = IF_SNDQ_MINLEN;
	}

	if_sndq_maxlen = i;
	return err;
}
11231
/*
 * sysctl handler for the default receive queue length; new values are
 * clamped up to IF_RCVQ_MINLEN.
 */
static int
sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int i, err;

	i = if_rcvq_maxlen;

	err = sysctl_handle_int(oidp, &i, 0, req);
	/* Error, or a read-only request: nothing to store */
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (i < IF_RCVQ_MINLEN) {
		i = IF_RCVQ_MINLEN;
	}

	if_rcvq_maxlen = i;
	return err;
}
11252
/*
 * sysctl handler for the receive queue burst limit.  On RELEASE builds
 * the value is clamped to [IF_RCVQ_BURST_LIMIT_MIN,
 * IF_RCVQ_BURST_LIMIT_MAX]; DEVELOPMENT/DEBUG builds accept any value.
 */
static int
sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	int i, err;

	i = if_rcvq_burst_limit;

	err = sysctl_handle_int(oidp, &i, 0, req);
	/* Error, or a read-only request: nothing to store */
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	/*
	 * Safeguard the burst limit to "sane" values on customer builds.
	 */
#if !(DEVELOPMENT || DEBUG)
	if (i < IF_RCVQ_BURST_LIMIT_MIN) {
		i = IF_RCVQ_BURST_LIMIT_MIN;
	}

	if (IF_RCVQ_BURST_LIMIT_MAX < i) {
		i = IF_RCVQ_BURST_LIMIT_MAX;
	}
#endif

	if_rcvq_burst_limit = i;
	return err;
}
11282
11283 static int
11284 sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS
11285 {
11286 #pragma unused(arg1, arg2)
11287 int i, err;
11288
11289 i = if_rcvq_burst_limit;
11290
11291 err = sysctl_handle_int(oidp, &i, 0, req);
11292 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11293 return err;
11294 }
11295
11296 if (IF_RCVQ_TRIM_PCT_MAX < i) {
11297 i = IF_RCVQ_TRIM_PCT_MAX;
11298 }
11299
11300 if (i < IF_RCVQ_TRIM_PCT_MIN) {
11301 i = IF_RCVQ_TRIM_PCT_MIN;
11302 }
11303
11304 if_rcvq_trim_pct = i;
11305 return err;
11306 }
11307
11308 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])11309 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
11310 int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
11311 {
11312 struct kev_dl_node_presence kev;
11313 struct sockaddr_dl *sdl;
11314 struct sockaddr_in6 *sin6;
11315 int ret = 0;
11316
11317 VERIFY(ifp);
11318 VERIFY(sa);
11319 VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
11320
11321 bzero(&kev, sizeof(kev));
11322 sin6 = &kev.sin6_node_address;
11323 sdl = &kev.sdl_node_address;
11324 nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
11325 kev.rssi = rssi;
11326 kev.link_quality_metric = lqm;
11327 kev.node_proximity_metric = npm;
11328 bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
11329
11330 ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
11331 if (ret == 0 || ret == EEXIST) {
11332 int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
11333 &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
11334 if (err != 0) {
11335 log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
11336 "error %d\n", __func__, err);
11337 }
11338 }
11339
11340 if (ret == EEXIST) {
11341 ret = 0;
11342 }
11343 return ret;
11344 }
11345
/*
 * Announce that a neighboring node has disappeared: remove it from the
 * ND6 neighbor cache and, on success, post a KEV_DL_NODE_ABSENCE event.
 *
 * sa may be the node's IPv6 address (AF_INET6) or its link-layer
 * address (AF_LINK); the missing half of the pair is derived either
 * from the neighbor cache or via nd6_alt_node_addr_decompose().
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* Stamp the event with this interface's type and index */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
11386
/*
 * Variant of dlil_node_present() where the caller supplies BOTH the
 * node's IPv6 address (sa, AF_INET6) and its link-layer address
 * (sdl, AF_LINK) explicitly, instead of having them derived from a
 * single sockaddr.  Behavior otherwise matches dlil_node_present():
 * registers the node with ND6 and posts KEV_DL_NODE_PRESENCE;
 * EEXIST is treated as success.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	/* Stamp the event copy with this interface's type and index */
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	/* A node we already knew about is not an error for the caller */
	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11430
11431 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11432 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11433 kauth_cred_t *credp)
11434 {
11435 const u_int8_t *bytes;
11436 size_t size;
11437
11438 bytes = CONST_LLADDR(sdl);
11439 size = sdl->sdl_alen;
11440
11441 #if CONFIG_MACF
11442 if (dlil_lladdr_ckreq) {
11443 switch (sdl->sdl_type) {
11444 case IFT_ETHER:
11445 case IFT_IEEE1394:
11446 break;
11447 default:
11448 credp = NULL;
11449 break;
11450 }
11451 ;
11452
11453 if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11454 static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11455 [0] = 2
11456 };
11457
11458 bytes = unspec;
11459 }
11460 }
11461 #else
11462 #pragma unused(credp)
11463 #endif
11464
11465 if (sizep != NULL) {
11466 *sizep = size;
11467 }
11468 return bytes;
11469 }
11470
/*
 * Post a KEV_DL_ISSUES kernel event reporting a problem observed by a
 * driver/module on this interface.
 *
 * modid - DLIL_MODIDLEN-byte module identifier (required);
 * info  - optional DLIL_MODARGLEN-byte module-specific payload.
 * The event carries a seconds-resolution timestamp.
 */
void
dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
    u_int8_t info[DLIL_MODARGLEN])
{
	struct kev_dl_issues kev;
	struct timeval tv;

	VERIFY(ifp != NULL);
	VERIFY(modid != NULL);
	_CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
	_CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);

	bzero(&kev, sizeof(kev));

	microtime(&tv);
	kev.timestamp = tv.tv_sec;
	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
	if (info != NULL) {
		bcopy(info, &kev.info, DLIL_MODARGLEN);
	}

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
	    &kev.link_data, sizeof(kev), FALSE);
}
11495
/*
 * Handle SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC ioctls.
 *
 * Set (root only): maps ifo_flags to a throttle level and applies it
 * via ifnet_set_throttle().  Get: reports the current throttle level
 * in ifo_flags.  In both cases, on success ifo_inuse is filled with
 * the count of opportunistic TCP+UDP connections on the interface.
 * EALREADY from the set path is mapped to success.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11554
/*
 * Query the interface's output throttling level into *level.
 * Returns ENXIO for interfaces without IFEF_TXSTART (no send queue
 * to throttle); otherwise 0, with *level defaulting to
 * IFNET_THROTTLE_OFF when the classq is not enabled.
 */
int
ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	*level = IFNET_THROTTLE_OFF;

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* Throttling works only for IFCQ, not ALTQ instances */
	if (IFCQ_IS_ENABLED(ifq)) {
		/* { 0, ... } = query request (first field: 0=get, 1=set) */
		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
		*level = req.level;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
11580
/*
 * Set the interface's output throttling level (IFNET_THROTTLE_OFF or
 * IFNET_THROTTLE_OPPORTUNISTIC; anything else is EINVAL).  Returns
 * ENXIO for interfaces without IFEF_TXSTART.  On success, NECP clients
 * are notified, and turning throttling off restarts the transmit path.
 */
int
ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;

	switch (level) {
	case IFNET_THROTTLE_OFF:
	case IFNET_THROTTLE_OPPORTUNISTIC:
		break;
	default:
		return EINVAL;
	}

	IFCQ_LOCK(ifq);
	if (IFCQ_IS_ENABLED(ifq)) {
		/* { 1, level } = set request (first field: 0=get, 1=set) */
		cqrq_throttle_t req = { 1, level };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
	}
	IFCQ_UNLOCK(ifq);

	if (err == 0) {
		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
		    level);
#if NECP
		necp_update_all_clients();
#endif /* NECP */
		/* Unthrottling: kick the start thread to drain queued packets */
		if (level == IFNET_THROTTLE_OFF) {
			ifnet_start(ifp);
		}
	}

	return err;
}
11622
/*
 * Handle SIOCSIFLOG / SIOCGIFLOG ioctls.
 *
 * Set requires PRIV_NET_INTERFACE_CONTROL; the level must be in
 * [IFNET_LOG_MIN, IFNET_LOG_MAX] and at least one IFNET_LOGF_MASK
 * flag must be given, else EINVAL.  Get copies the current logging
 * parameters back into the ifreq.
 */
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* Mask off unknown flag bits; none left means a bad request */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11670
/*
 * Apply logging parameters to an interface, forwarding the non-DLIL
 * facilities to the driver via its if_output_ctl callback when one is
 * registered.  Callers must pre-validate level and flags (see the
 * VERIFYs); ifnet_getset_log() does this.
 *
 * Setting level to IFNET_LOG_DEFAULT clears all facility flags;
 * otherwise flags accumulate onto the interface's current set.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* DLIL facility is handled here, not by the driver */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 * (NOTE(review): the if_output_ctl == NULL re-test here is
		 * redundant — we are already in the else branch of that
		 * same condition.)
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11729
11730 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11731 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11732 int32_t *category, int32_t *subcategory)
11733 {
11734 if (level != NULL) {
11735 *level = ifp->if_log.level;
11736 }
11737 if (flags != NULL) {
11738 *flags = ifp->if_log.flags;
11739 }
11740 if (category != NULL) {
11741 *category = ifp->if_log.category;
11742 }
11743 if (subcategory != NULL) {
11744 *subcategory = ifp->if_log.subcategory;
11745 }
11746
11747 return 0;
11748 }
11749
/*
 * Notify the driver (via if_output_ctl) that an address of family af
 * was added to or changed on the interface; also runs the PF address
 * hook when PF is built in.  Returns EOPNOTSUPP when the driver has
 * no output control callback.
 */
int
ifnet_notify_address(struct ifnet *ifp, int af)
{
	struct ifnet_notify_address_params na;

#if PF
	(void) pf_ifaddr_hook(ifp);
#endif /* PF */

	if (ifp->if_output_ctl == NULL) {
		return EOPNOTSUPP;
	}

	bzero(&na, sizeof(na));
	na.address_family = (sa_family_t)af;

	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
	           sizeof(na), &na);
}
11769
11770 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11771 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11772 {
11773 if (ifp == NULL || flowid == NULL) {
11774 return EINVAL;
11775 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11776 !IF_FULLY_ATTACHED(ifp)) {
11777 return ENXIO;
11778 }
11779
11780 *flowid = ifp->if_flowhash;
11781
11782 return 0;
11783 }
11784
11785 errno_t
ifnet_disable_output(struct ifnet * ifp)11786 ifnet_disable_output(struct ifnet *ifp)
11787 {
11788 int err;
11789
11790 if (ifp == NULL) {
11791 return EINVAL;
11792 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11793 !IF_FULLY_ATTACHED(ifp)) {
11794 return ENXIO;
11795 }
11796
11797 if ((err = ifnet_fc_add(ifp)) == 0) {
11798 lck_mtx_lock_spin(&ifp->if_start_lock);
11799 ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11800 lck_mtx_unlock(&ifp->if_start_lock);
11801 }
11802 return err;
11803 }
11804
11805 errno_t
ifnet_enable_output(struct ifnet * ifp)11806 ifnet_enable_output(struct ifnet *ifp)
11807 {
11808 if (ifp == NULL) {
11809 return EINVAL;
11810 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11811 !IF_FULLY_ATTACHED(ifp)) {
11812 return ENXIO;
11813 }
11814
11815 ifnet_start_common(ifp, TRUE, FALSE);
11816 return 0;
11817 }
11818
/*
 * Flow advisory: a driver indicates that the flow identified by
 * "flowhash" may resume transmitting.  Looks up (and removes) the
 * matching flow-control entry and re-enables output on the owning
 * interface if the hash still matches.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	/* ifnet_fc_get() detaches the entry from the tree on success */
	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* drop the I/O reference taken by ifnet_is_attached() */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11842
11843 /*
11844 * Function to compare ifnet_fc_entries in ifnet flow control tree
11845 */
11846 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11847 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11848 {
11849 return fc1->ifce_flowhash - fc2->ifce_flowhash;
11850 }
11851
/*
 * Register ifp in the global flow-control tree, keyed by its flow
 * hash.  Returns 0 when the interface is now (or already was) in the
 * tree; EAGAIN when a different interface already occupies the same
 * flow hash (rare collision, deliberately not chained).
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	/* Caller guarantees a TX-start interface with a valid hash */
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer. There can be a collision
		 * on flow hash but the probability is low. Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex: zalloc_flags(Z_WAITOK) below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11895
/*
 * Look up the flow-control entry for "flowhash" and, if found, remove
 * it from the tree and return it to the caller (who owns it and must
 * free it via ifnet_fc_entry_free()).  Returns NULL when there is no
 * entry or its interface is no longer attached.
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* Detach from the tree while still holding the lock */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex: attach check below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11933
/* Return a flow-control entry to its zone allocator. */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11939
11940 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11941 ifnet_calc_flowhash(struct ifnet *ifp)
11942 {
11943 struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11944 uint32_t flowhash = 0;
11945
11946 if (ifnet_flowhash_seed == 0) {
11947 ifnet_flowhash_seed = RandomULong();
11948 }
11949
11950 bzero(&fh, sizeof(fh));
11951
11952 (void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11953 fh.ifk_unit = ifp->if_unit;
11954 fh.ifk_flags = ifp->if_flags;
11955 fh.ifk_eflags = ifp->if_eflags;
11956 fh.ifk_capabilities = ifp->if_capabilities;
11957 fh.ifk_capenable = ifp->if_capenable;
11958 fh.ifk_output_sched_model = ifp->if_output_sched_model;
11959 fh.ifk_rand1 = RandomULong();
11960 fh.ifk_rand2 = RandomULong();
11961
11962 try_again:
11963 flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11964 if (flowhash == 0) {
11965 /* try to get a non-zero flowhash */
11966 ifnet_flowhash_seed = RandomULong();
11967 goto try_again;
11968 }
11969
11970 return flowhash;
11971 }
11972
11973 int
ifnet_set_netsignature(struct ifnet * ifp,uint8_t family,uint8_t len,uint16_t flags,uint8_t * data)11974 ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
11975 uint16_t flags, uint8_t *data)
11976 {
11977 #pragma unused(flags)
11978 int error = 0;
11979
11980 switch (family) {
11981 case AF_INET:
11982 if_inetdata_lock_exclusive(ifp);
11983 if (IN_IFEXTRA(ifp) != NULL) {
11984 if (len == 0) {
11985 /* Allow clearing the signature */
11986 IN_IFEXTRA(ifp)->netsig_len = 0;
11987 bzero(IN_IFEXTRA(ifp)->netsig,
11988 sizeof(IN_IFEXTRA(ifp)->netsig));
11989 if_inetdata_lock_done(ifp);
11990 break;
11991 } else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
11992 error = EINVAL;
11993 if_inetdata_lock_done(ifp);
11994 break;
11995 }
11996 IN_IFEXTRA(ifp)->netsig_len = len;
11997 bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
11998 } else {
11999 error = ENOMEM;
12000 }
12001 if_inetdata_lock_done(ifp);
12002 break;
12003
12004 case AF_INET6:
12005 if_inet6data_lock_exclusive(ifp);
12006 if (IN6_IFEXTRA(ifp) != NULL) {
12007 if (len == 0) {
12008 /* Allow clearing the signature */
12009 IN6_IFEXTRA(ifp)->netsig_len = 0;
12010 bzero(IN6_IFEXTRA(ifp)->netsig,
12011 sizeof(IN6_IFEXTRA(ifp)->netsig));
12012 if_inet6data_lock_done(ifp);
12013 break;
12014 } else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
12015 error = EINVAL;
12016 if_inet6data_lock_done(ifp);
12017 break;
12018 }
12019 IN6_IFEXTRA(ifp)->netsig_len = len;
12020 bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
12021 } else {
12022 error = ENOMEM;
12023 }
12024 if_inet6data_lock_done(ifp);
12025 break;
12026
12027 default:
12028 error = EINVAL;
12029 break;
12030 }
12031
12032 return error;
12033 }
12034
/*
 * Copy out the network signature for the given address family.
 * On input *len is the caller's buffer capacity; on success it is
 * updated to the actual signature length.  Returns EINVAL on bad
 * arguments or an undersized buffer, ENOENT when no signature is
 * set, ENOMEM when the per-family extension area is missing.
 * *flags is currently always set to 0 on success.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* A zero-capacity buffer is always rejected */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		/* Not reached when the early "break" above already unlocked */
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* A zero-capacity buffer is always rejected */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
12095
12096 int
ifnet_set_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12097 ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12098 {
12099 int i, error = 0, one_set = 0;
12100
12101 if_inet6data_lock_exclusive(ifp);
12102
12103 if (IN6_IFEXTRA(ifp) == NULL) {
12104 error = ENOMEM;
12105 goto out;
12106 }
12107
12108 for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12109 uint32_t prefix_len =
12110 prefixes[i].prefix_len;
12111 struct in6_addr *prefix =
12112 &prefixes[i].ipv6_prefix;
12113
12114 if (prefix_len == 0) {
12115 clat_log0((LOG_DEBUG,
12116 "NAT64 prefixes purged from Interface %s\n",
12117 if_name(ifp)));
12118 /* Allow clearing the signature */
12119 IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
12120 bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
12121 sizeof(struct in6_addr));
12122
12123 continue;
12124 } else if (prefix_len != NAT64_PREFIX_LEN_32 &&
12125 prefix_len != NAT64_PREFIX_LEN_40 &&
12126 prefix_len != NAT64_PREFIX_LEN_48 &&
12127 prefix_len != NAT64_PREFIX_LEN_56 &&
12128 prefix_len != NAT64_PREFIX_LEN_64 &&
12129 prefix_len != NAT64_PREFIX_LEN_96) {
12130 clat_log0((LOG_DEBUG,
12131 "NAT64 prefixlen is incorrect %d\n", prefix_len));
12132 error = EINVAL;
12133 goto out;
12134 }
12135
12136 if (IN6_IS_SCOPE_EMBED(prefix)) {
12137 clat_log0((LOG_DEBUG,
12138 "NAT64 prefix has interface/link local scope.\n"));
12139 error = EINVAL;
12140 goto out;
12141 }
12142
12143 IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
12144 bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
12145 sizeof(struct in6_addr));
12146 clat_log0((LOG_DEBUG,
12147 "NAT64 prefix set to %s with prefixlen: %d\n",
12148 ip6_sprintf(prefix), prefix_len));
12149 one_set = 1;
12150 }
12151
12152 out:
12153 if_inet6data_lock_done(ifp);
12154
12155 if (error == 0 && one_set != 0) {
12156 necp_update_all_clients();
12157 }
12158
12159 return error;
12160 }
12161
12162 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12163 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12164 {
12165 int i, found_one = 0, error = 0;
12166
12167 if (ifp == NULL) {
12168 return EINVAL;
12169 }
12170
12171 if_inet6data_lock_shared(ifp);
12172
12173 if (IN6_IFEXTRA(ifp) == NULL) {
12174 error = ENOMEM;
12175 goto out;
12176 }
12177
12178 for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12179 if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
12180 found_one = 1;
12181 }
12182 }
12183
12184 if (found_one == 0) {
12185 error = ENOENT;
12186 goto out;
12187 }
12188
12189 if (prefixes) {
12190 bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
12191 sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
12192 }
12193
12194 out:
12195 if_inet6data_lock_done(ifp);
12196
12197 return error;
12198 }
12199
/*
 * Output-path checksum debug hook: when HWCKSUM_DBG_FINALIZE_FORCED
 * is enabled, force software finalization of delayed checksums so
 * the layers above can be validated against a known-good software
 * path.  TSO packets are left untouched since their checksums are
 * produced per-segment by the hardware.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		/* did_sw reports which delayed checksums were computed here */
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		/* Only IPv4/IPv6 carry checksums we know how to finalize */
		return;
	}
}
12241
/*
 * Input-path checksum debug hook.  Depending on hwcksum_dbg_mode it
 * can (a) force partial checksum offload by computing the 16-bit 1's
 * complement sum in software from a configurable offset, (b) verify
 * a driver-supplied partial checksum against a software recompute,
 * and (c) emulate hardware that starts its sum at a different offset
 * by adjusting the stored value.  frame_header must point within the
 * mbuf, at or before the current data pointer.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity-check the frame header pointer before doing arithmetic on it */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Link-layer header length preceding the network-layer data */
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is relative to the frame header */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* Re-anchor the sum at the emulated start offset */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
12366
12367 static int
12368 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
12369 {
12370 #pragma unused(arg1, arg2)
12371 u_int32_t i;
12372 int err;
12373
12374 i = hwcksum_dbg_mode;
12375
12376 err = sysctl_handle_int(oidp, &i, 0, req);
12377 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12378 return err;
12379 }
12380
12381 if (hwcksum_dbg == 0) {
12382 return ENODEV;
12383 }
12384
12385 if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12386 return EINVAL;
12387 }
12388
12389 hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12390
12391 return err;
12392 }
12393
12394 static int
12395 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12396 {
12397 #pragma unused(arg1, arg2)
12398 u_int32_t i;
12399 int err;
12400
12401 i = hwcksum_dbg_partial_rxoff_forced;
12402
12403 err = sysctl_handle_int(oidp, &i, 0, req);
12404 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12405 return err;
12406 }
12407
12408 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12409 return ENODEV;
12410 }
12411
12412 hwcksum_dbg_partial_rxoff_forced = i;
12413
12414 return err;
12415 }
12416
12417 static int
12418 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12419 {
12420 #pragma unused(arg1, arg2)
12421 u_int32_t i;
12422 int err;
12423
12424 i = hwcksum_dbg_partial_rxoff_adj;
12425
12426 err = sysctl_handle_int(oidp, &i, 0, req);
12427 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12428 return err;
12429 }
12430
12431 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12432 return ENODEV;
12433 }
12434
12435 hwcksum_dbg_partial_rxoff_adj = i;
12436
12437 return err;
12438 }
12439
12440 static int
12441 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12442 {
12443 #pragma unused(oidp, arg1, arg2)
12444 int err;
12445
12446 if (req->oldptr == USER_ADDR_NULL) {
12447 }
12448 if (req->newptr != USER_ADDR_NULL) {
12449 return EPERM;
12450 }
12451 err = SYSCTL_OUT(req, &tx_chain_len_stats,
12452 sizeof(struct chain_len_stats));
12453
12454 return err;
12455 }
12456
12457 #if DEBUG || DEVELOPMENT
12458 /* Blob for sum16 verification */
/*
 * Blob for sum16 verification.  The byte values themselves are what
 * matter, not their meaning (the leading 1f 8b bytes suggest this was
 * once gzip output — presumably arbitrary; content is never decoded).
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
12494
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t init;         /* sumr has been computed at runtime */
	uint16_t len;           /* span length in bytes, from offset 0 */
	uint16_t sumr;          /* reference, filled lazily by dlil_verify_sum16() */
	uint16_t sumrp;         /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12519
12520 static void
dlil_verify_sum16(void)12521 dlil_verify_sum16(void)
12522 {
12523 struct mbuf *m;
12524 uint8_t *buf;
12525 int n;
12526
12527 /* Make sure test data plus extra room for alignment fits in cluster */
12528 _CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);
12529
12530 kprintf("DLIL: running SUM16 self-tests ... ");
12531
12532 m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
12533 m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));
12534
12535 buf = mtod(m, uint8_t *); /* base address */
12536
12537 for (n = 0; n < SUMTBL_MAX; n++) {
12538 uint16_t len = sumtbl[n].len;
12539 int i;
12540
12541 /* Verify for all possible alignments */
12542 for (i = 0; i < (int)sizeof(uint64_t); i++) {
12543 uint16_t sum, sumr;
12544 uint8_t *c;
12545
12546 /* Copy over test data to mbuf */
12547 VERIFY(len <= sizeof(sumdata));
12548 c = buf + i;
12549 bcopy(sumdata, c, len);
12550
12551 /* Zero-offset test (align by data pointer) */
12552 m->m_data = (caddr_t)c;
12553 m->m_len = len;
12554 sum = m_sum16(m, 0, len);
12555
12556 if (!sumtbl[n].init) {
12557 sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
12558 sumtbl[n].sumr = sumr;
12559 sumtbl[n].init = TRUE;
12560 } else {
12561 sumr = sumtbl[n].sumr;
12562 }
12563
12564 /* Something is horribly broken; stop now */
12565 if (sumr != sumtbl[n].sumrp) {
12566 panic_plain("\n%s: broken in_cksum_mbuf_ref() "
12567 "for len=%d align=%d sum=0x%04x "
12568 "[expected=0x%04x]\n", __func__,
12569 len, i, sum, sumr);
12570 /* NOTREACHED */
12571 } else if (sum != sumr) {
12572 panic_plain("\n%s: broken m_sum16() for len=%d "
12573 "align=%d sum=0x%04x [expected=0x%04x]\n",
12574 __func__, len, i, sum, sumr);
12575 /* NOTREACHED */
12576 }
12577
12578 /* Alignment test by offset (fixed data pointer) */
12579 m->m_data = (caddr_t)buf;
12580 m->m_len = i + len;
12581 sum = m_sum16(m, i, len);
12582
12583 /* Something is horribly broken; stop now */
12584 if (sum != sumr) {
12585 panic_plain("\n%s: broken m_sum16() for len=%d "
12586 "offset=%d sum=0x%04x [expected=0x%04x]\n",
12587 __func__, len, i, sum, sumr);
12588 /* NOTREACHED */
12589 }
12590 #if INET
12591 /* Simple sum16 contiguous buffer test by aligment */
12592 sum = b_sum16(c, len);
12593
12594 /* Something is horribly broken; stop now */
12595 if (sum != sumr) {
12596 panic_plain("\n%s: broken b_sum16() for len=%d "
12597 "align=%d sum=0x%04x [expected=0x%04x]\n",
12598 __func__, len, i, sum, sumr);
12599 /* NOTREACHED */
12600 }
12601 #endif /* INET */
12602 }
12603 }
12604 m_freem(m);
12605
12606 kprintf("PASSED\n");
12607 }
12608 #endif /* DEBUG || DEVELOPMENT */
12609
/* Expand to a switch case that returns the token's name as a string */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* kernel-event code to its symbolic name for logging.
 * Returns the empty string for codes not listed below.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
		CASE_STRINGIFY(KEV_DL_SIFMTU);
		CASE_STRINGIFY(KEV_DL_SIFPHYS);
		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
		CASE_STRINGIFY(KEV_DL_ADDMULTI);
		CASE_STRINGIFY(KEV_DL_DELMULTI);
		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_OFF);
		CASE_STRINGIFY(KEV_DL_LINK_ON);
		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
		CASE_STRINGIFY(KEV_DL_ISSUES);
		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
12646
12647 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12648 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12649 {
12650 #pragma unused(arg1)
12651 struct ifnet *ifp = arg0;
12652
12653 if (ifnet_is_attached(ifp, 1)) {
12654 nstat_ifnet_threshold_reached(ifp->if_index);
12655 ifnet_decr_iorefcnt(ifp);
12656 }
12657 }
12658
/*
 * Check whether the interface has moved enough bytes since the last
 * notification to warrant telling NetworkStatistics, and if so arm
 * the data-threshold thread call.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 * The CAS on if_dt_bytes ensures only one caller racing past
	 * the threshold arms the thread call; the isactive check
	 * avoids re-arming while a notification is still pending.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Align the callback to the periodic interval */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No interval configured: fire immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12688
12689 #if (DEVELOPMENT || DEBUG)
12690 /*
12691 * The sysctl variable name contains the input parameters of
12692 * ifnet_get_keepalive_offload_frames()
12693 * ifp (interface index): name[0]
12694 * frames_array_count: name[1]
12695 * frame_data_offset: name[2]
12696 * The return length gives used_frames_count
12697 */
/*
 * Root-only sysctl returning the keep-alive offload frames for an
 * interface.  See the comment above: name[0] is the interface index,
 * name[1] the frame array count, name[2] the frame data offset; the
 * frames are copied out one at a time and the returned length gives
 * the number of frames actually used.
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root can get look at other people TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		/* This node is read-only */
		error = EPERM;
		goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/* Make sure the passed buffer is large enough */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	/*
	 * NOTE(review): ifp is used after the head lock is dropped —
	 * presumably safe because interface slots are never reused while
	 * sysctl is in flight; confirm against ifindex2ifnet lifetime rules.
	 */
	ifp = ifindex2ifnet[idx];
	ifnet_head_done();

	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* Copy out only the frames that were actually filled in */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
12789 #endif /* DEVELOPMENT || DEBUG */
12790
/*
 * Fold per-flow interface statistics into TCP's accounting; thin
 * wrapper around tcp_update_stats_per_flow().
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12797
/* Atomically OR set_flags into *flags_p; returns the previous value. */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12803
/* Atomically clear clear_flags in *flags_p. */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12809
/* Atomically set bits in if_eflags; returns the previous flag word. */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12815
/* Atomically clear bits in if_eflags. */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12821
/* Atomically set bits in if_xflags; returns the previous flag word. */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12827
/* Atomically clear bits in if_xflags. */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12833
/*
 * Bump the interface's traffic-rule generation counter (relaxed atomic
 * increment).  Observers compare a cached copy against this counter via
 * ifnet_sync_traffic_rule_genid() to detect rule changes.
 */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}
12839
12840 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12841 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12842 {
12843 if (*genid != ifp->if_traffic_rule_genid) {
12844 *genid = ifp->if_traffic_rule_genid;
12845 return TRUE;
12846 }
12847 return FALSE;
12848 }
/*
 * Publish a new traffic-rule count for the interface (store-release so
 * the count is visible before the generation bump), then advance the
 * generation id so observers notice the change.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
12855
12856 static void
log_hexdump(void * data,size_t len)12857 log_hexdump(void *data, size_t len)
12858 {
12859 size_t i, j, k;
12860 unsigned char *ptr = (unsigned char *)data;
12861 #define MAX_DUMP_BUF 32
12862 unsigned char buf[3 * MAX_DUMP_BUF + 1];
12863
12864 for (i = 0; i < len; i += MAX_DUMP_BUF) {
12865 for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12866 unsigned char msnbl = ptr[j] >> 4;
12867 unsigned char lsnbl = ptr[j] & 0x0f;
12868
12869 buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12870 buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12871
12872 if ((j % 2) == 1) {
12873 buf[k++] = ' ';
12874 }
12875 if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12876 buf[k++] = ' ';
12877 }
12878 }
12879 buf[k] = 0;
12880 os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12881 }
12882 }
12883
12884 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
12885 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12886 net_check_compatible_if_filter(struct ifnet *ifp)
12887 {
12888 if (ifp == NULL) {
12889 if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12890 return false;
12891 }
12892 } else {
12893 if (ifp->if_flt_non_os_count > 0) {
12894 return false;
12895 }
12896 }
12897 return true;
12898 }
12899 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12900
/*
 * Helper for dlil_dump_top_if_qlen(): after an scnprintf() that wrote
 * `k` bytes, shrink the remaining space `clen` and advance the output
 * cursor `c`; bail to the caller's `done:` label once the buffer is
 * effectively full.  Relies on locals `c`, `clen`, `k` and a `done:`
 * label existing at every expansion site.
 */
#define DUMP_BUF_CHK() { \
	clen -= k; \
	if (clen < 1) \
		goto done; \
	c += k; \
}
12907
12908 int dlil_dump_top_if_qlen(char *, int);
12909 int
dlil_dump_top_if_qlen(char * str,int str_len)12910 dlil_dump_top_if_qlen(char *str, int str_len)
12911 {
12912 char *c = str;
12913 int k, clen = str_len;
12914 struct ifnet *top_ifcq_ifp = NULL;
12915 uint32_t top_ifcq_len = 0;
12916 struct ifnet *top_inq_ifp = NULL;
12917 uint32_t top_inq_len = 0;
12918
12919 for (int ifidx = 1; ifidx < if_index; ifidx++) {
12920 struct ifnet *ifp = ifindex2ifnet[ifidx];
12921 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
12922
12923 if (ifp == NULL) {
12924 continue;
12925 }
12926 if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
12927 top_ifcq_len = ifp->if_snd->ifcq_len;
12928 top_ifcq_ifp = ifp;
12929 }
12930 if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
12931 top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
12932 top_inq_ifp = ifp;
12933 }
12934 }
12935
12936 if (top_ifcq_ifp != NULL) {
12937 k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
12938 top_ifcq_len, top_ifcq_ifp->if_xname);
12939 DUMP_BUF_CHK();
12940 }
12941 if (top_inq_ifp != NULL) {
12942 k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
12943 top_inq_len, top_inq_ifp->if_xname);
12944 DUMP_BUF_CHK();
12945 }
12946 done:
12947 return str_len - clen;
12948 }
12949