1 /*
2 * Copyright (c) 1999-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections. This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/if_arp.h>
54 #include <net/iptap.h>
55 #include <net/pktap.h>
56 #include <net/nwk_wq.h>
57 #include <sys/kern_event.h>
58 #include <sys/kdebug.h>
59 #include <sys/mcache.h>
60 #include <sys/syslog.h>
61 #include <sys/protosw.h>
62 #include <sys/priv.h>
63
64 #include <kern/assert.h>
65 #include <kern/task.h>
66 #include <kern/thread.h>
67 #include <kern/sched_prim.h>
68 #include <kern/locks.h>
69 #include <kern/zalloc.h>
70
71 #include <net/kpi_protocol.h>
72 #include <net/if_types.h>
73 #include <net/if_ipsec.h>
74 #include <net/if_llreach.h>
75 #include <net/if_utun.h>
76 #include <net/kpi_interfacefilter.h>
77 #include <net/classq/classq.h>
78 #include <net/classq/classq_sfb.h>
79 #include <net/flowhash.h>
80 #include <net/ntstat.h>
81 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
82 #include <skywalk/lib/net_filter_event.h>
83 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
84 #include <net/if_llatbl.h>
85 #include <net/net_api_stats.h>
86 #include <net/if_ports_used.h>
87 #include <net/if_vlan_var.h>
88 #include <netinet/in.h>
89 #if INET
90 #include <netinet/in_var.h>
91 #include <netinet/igmp_var.h>
92 #include <netinet/ip_var.h>
93 #include <netinet/tcp.h>
94 #include <netinet/tcp_var.h>
95 #include <netinet/udp.h>
96 #include <netinet/udp_var.h>
97 #include <netinet/if_ether.h>
98 #include <netinet/in_pcb.h>
99 #include <netinet/in_tclass.h>
100 #include <netinet/ip.h>
101 #include <netinet/ip_icmp.h>
102 #include <netinet/icmp_var.h>
103 #endif /* INET */
104
105 #include <net/nat464_utils.h>
106 #include <netinet6/in6_var.h>
107 #include <netinet6/nd6.h>
108 #include <netinet6/mld6_var.h>
109 #include <netinet6/scope6_var.h>
110 #include <netinet/ip6.h>
111 #include <netinet/icmp6.h>
112 #include <net/pf_pbuf.h>
113 #include <libkern/OSAtomic.h>
114 #include <libkern/tree.h>
115
116 #include <dev/random/randomdev.h>
117 #include <machine/machine_routines.h>
118
119 #include <mach/thread_act.h>
120 #include <mach/sdt.h>
121
122 #if CONFIG_MACF
123 #include <sys/kauth.h>
124 #include <security/mac_framework.h>
125 #include <net/ethernet.h>
126 #include <net/firewire.h>
127 #endif
128
129 #if PF
130 #include <net/pfvar.h>
131 #endif /* PF */
132 #include <net/pktsched/pktsched.h>
133 #include <net/pktsched/pktsched_netem.h>
134
135 #if NECP
136 #include <net/necp.h>
137 #endif /* NECP */
138
139 #if SKYWALK
140 #include <skywalk/packet/packet_queue.h>
141 #include <skywalk/nexus/netif/nx_netif.h>
142 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
143 #endif /* SKYWALK */
144
145 #include <os/log.h>
146
147 #define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0)
148 #define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2)
149 #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
150 #define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
151 #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
152
153 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
154 #define MAX_LINKADDR 4 /* LONGWORDS */
155
156 #if 1
157 #define DLIL_PRINTF printf
158 #else
159 #define DLIL_PRINTF kprintf
160 #endif
161
162 #define IF_DATA_REQUIRE_ALIGNED_64(f) \
163 _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
164
165 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \
166 _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
167
/*
 * Protocol KPI versions. Stored in if_proto.proto_kpi to select which
 * arm of the if_proto kpi union (v1 or v2) holds the attached
 * protocol's callbacks.
 */
enum {
	kProtoKPI_v1 = 1,
	kProtoKPI_v2 = 2
};
172
173 uint64_t if_creation_generation_count = 0;
174
175 /*
176 * List of if_proto structures in if_proto_hash[] is protected by
177 * the ifnet lock. The rest of the fields are initialized at protocol
178 * attach time and never change, thus no lock required as long as
179 * a reference to it is valid, via if_proto_ref().
180 */
struct if_proto {
	SLIST_ENTRY(if_proto) next_hash;        /* chain on an if_proto_hash[] bucket */
	u_int32_t refcount;                     /* managed via if_proto_ref()/if_proto_free() */
	u_int32_t detached;                     /* nonzero once protocol has been detached */
	struct ifnet *ifp;                      /* interface this protocol is attached to */
	protocol_family_t protocol_family;      /* e.g. PF_INET / PF_INET6 */
	int proto_kpi;                          /* kProtoKPI_v1 or kProtoKPI_v2; selects union arm */
	union {
		/* Version 1 KPI callbacks (v1 input takes a frame header pointer) */
		struct {
			proto_media_input input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v1;
		/* Version 2 KPI callbacks (v2 input variant) */
		struct {
			proto_media_input_v2 input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v2;
	} kpi;
};
209
210 SLIST_HEAD(proto_hash_entry, if_proto);
211
212 #define DLIL_SDLDATALEN \
213 (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
214
/*
 * DLIL-private wrapper around the public ifnet.  Instances are carved
 * from dlif_zone and linked on dlil_ifnet_head; the embedded dl_if
 * must remain the first member so DLIL_TO_IFP()/IFP_TO_DLIL() can
 * convert between the two views.
 */
struct dlil_ifnet {
	struct ifnet dl_if; /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */
	u_int32_t dl_if_flags; /* flags (below) */
	u_int32_t dl_if_refcnt; /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void *dl_if_uniqueid; /* unique interface id */
	size_t dl_if_uniqueid_len; /* length of the unique id */
	char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */
	char dl_if_xnamestorage[IFXNAMSIZ]; /* external name storage */
	struct {
		struct ifaddr ifa; /* lladdr ifa */
		u_int8_t asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr;
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;
	u_int8_t dl_if_unused;
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t dl_if_attach; /* attach PC stacktrace */
	ctrace_t dl_if_detach; /* detach PC stacktrace */
};
242
/* Values for dl_if_flags (private to DLIL) */
#define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */
#define DLIF_REUSE 0x2 /* DLIL ifnet recycler, ifnet is not new */
#define DLIF_DEBUG 0x4 /* has debugging info */

#define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */
249
250 /* For gdb */
251 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
252
/*
 * Debug variant of dlil_ifnet: the base object followed by reference
 * hold/release counters and circular caller-stacktrace histories
 * (IF_REF_TRACE_HIST_SIZE entries each), populated when debugging
 * is enabled (DLIF_DEBUG).
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet dldbg_dlif; /* dlil_ifnet */
	u_int16_t dldbg_if_refhold_cnt; /* # ifnet references */
	u_int16_t dldbg_if_refrele_cnt; /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
263
/*
 * Convert between the DLIL-private dlil_ifnet and the public ifnet it
 * embeds (dl_if is the first member, so the two addresses coincide).
 * Arguments are fully parenthesized so the macros expand safely when
 * passed compound expressions (e.g. a cast or pointer arithmetic).
 */
#define DLIL_TO_IFP(s) (&(s)->dl_if)
#define IFP_TO_DLIL(s) ((struct dlil_ifnet *)(s))
266
/*
 * State for one attached interface filter.  Filters on an interface
 * are kept on a tailq (filt_next); see dlil_interface_filters_input()/
 * _output() for the traversal, and dlil_detach_filter_internal() for
 * teardown.
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter) filt_next;    /* link on the interface's filter list */
	u_int32_t filt_skip;                    /* NOTE(review): appears to mark filter as skipped — confirm against users */
	u_int32_t filt_flags;                   /* filter flags */
	ifnet_t filt_ifp;                       /* interface the filter is attached to */
	const char *filt_name;                  /* filter name (for identification) */
	void *filt_cookie;                      /* opaque client context passed to callbacks */
	protocol_family_t filt_protocol;        /* protocol the filter applies to */
	iff_input_func filt_input;              /* inbound packet callback */
	iff_output_func filt_output;            /* outbound packet callback */
	iff_event_func filt_event;              /* interface event callback */
	iff_ioctl_func filt_ioctl;              /* ioctl callback */
	iff_detached_func filt_detached;        /* called when the filter is detached */
};
281
282 /* Mbuf queue used for freeing the excessive mbufs */
283 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
284
285 struct proto_input_entry;
286
287 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
288
289 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
290
291 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
292 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
293 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
294 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
295 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
296
297 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
298 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
299 &dlil_lck_attributes);
300 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
301 &dlil_lck_attributes);
302
303 #if DEBUG
304 static unsigned int ifnet_debug = 1; /* debugging (enabled) */
305 #else
306 static unsigned int ifnet_debug; /* debugging (disabled) */
307 #endif /* !DEBUG */
308 static unsigned int dlif_size; /* size of dlil_ifnet to allocate */
309 static unsigned int dlif_bufsize; /* size of dlif_size + headroom */
310 static struct zone *dlif_zone; /* zone for dlil_ifnet */
311 #define DLIF_ZONE_NAME "ifnet" /* zone name */
312
313 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
314
315 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
316
317 static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */
318 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
319 static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */
320 #define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */
321
322 static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */
323 static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */
324 static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */
325 #define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */
326
327 static u_int32_t net_rtref;
328
329 static struct dlil_main_threading_info dlil_main_input_thread_info;
330 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
331 (struct dlil_threading_info *)&dlil_main_input_thread_info;
332
333 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
334 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
335 static void dlil_if_trace(struct dlil_ifnet *, int);
336 static void if_proto_ref(struct if_proto *);
337 static void if_proto_free(struct if_proto *);
338 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
339 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
340 u_int32_t list_count);
341 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
342 static void if_flt_monitor_busy(struct ifnet *);
343 static void if_flt_monitor_unbusy(struct ifnet *);
344 static void if_flt_monitor_enter(struct ifnet *);
345 static void if_flt_monitor_leave(struct ifnet *);
346 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
347 char **, protocol_family_t);
348 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
349 protocol_family_t);
350 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
351 const struct sockaddr_dl *);
352 static int ifnet_lookup(struct ifnet *);
353 static void if_purgeaddrs(struct ifnet *);
354
355 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
356 struct mbuf *, char *);
357 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
358 struct mbuf *);
359 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
360 mbuf_t *, const struct sockaddr *, void *, char *, char *);
361 static void ifproto_media_event(struct ifnet *, protocol_family_t,
362 const struct kev_msg *);
363 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
364 unsigned long, void *);
365 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
366 struct sockaddr_dl *, size_t);
367 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
368 const struct sockaddr_dl *, const struct sockaddr *,
369 const struct sockaddr_dl *, const struct sockaddr *);
370
371 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
372 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
373 boolean_t poll, struct thread *tp);
374 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
375 struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
376 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
377 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
378 protocol_family_t *);
379 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
380 const struct ifnet_demux_desc *, u_int32_t);
381 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
382 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
383 #if !XNU_TARGET_OS_OSX
384 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
385 const struct sockaddr *, const char *, const char *,
386 u_int32_t *, u_int32_t *);
387 #else /* XNU_TARGET_OS_OSX */
388 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
389 const struct sockaddr *, const char *, const char *);
390 #endif /* XNU_TARGET_OS_OSX */
391 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
392 const struct sockaddr *, const char *, const char *,
393 u_int32_t *, u_int32_t *);
394 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
395 static void ifp_if_free(struct ifnet *);
396 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
397 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
398 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
399
400 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
401 dlil_freeq_t *, struct ifnet_stat_increment_param *);
402
403 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
404 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
405 boolean_t, struct thread *);
406 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
407 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
408 boolean_t, struct thread *);
409
410 static void dlil_main_input_thread_func(void *, wait_result_t);
411 static void dlil_main_input_thread_cont(void *, wait_result_t);
412
413 static void dlil_input_thread_func(void *, wait_result_t);
414 static void dlil_input_thread_cont(void *, wait_result_t);
415
416 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
417 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
418
419 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
420 thread_continue_t *);
421 static void dlil_terminate_input_thread(struct dlil_threading_info *);
422 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
423 struct dlil_threading_info *, struct ifnet *, boolean_t);
424 static boolean_t dlil_input_stats_sync(struct ifnet *,
425 struct dlil_threading_info *);
426 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
427 u_int32_t, ifnet_model_t, boolean_t);
428 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
429 const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
430 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
431 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
432 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
433 #if DEBUG || DEVELOPMENT
434 static void dlil_verify_sum16(void);
435 #endif /* DEBUG || DEVELOPMENT */
436 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
437 protocol_family_t);
438 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
439 protocol_family_t);
440
441 static void dlil_incr_pending_thread_count(void);
442 static void dlil_decr_pending_thread_count(void);
443
444 static void ifnet_detacher_thread_func(void *, wait_result_t);
445 static void ifnet_detacher_thread_cont(void *, wait_result_t);
446 static void ifnet_detach_final(struct ifnet *);
447 static void ifnet_detaching_enqueue(struct ifnet *);
448 static struct ifnet *ifnet_detaching_dequeue(void);
449
450 static void ifnet_start_thread_func(void *, wait_result_t);
451 static void ifnet_start_thread_cont(void *, wait_result_t);
452
453 static void ifnet_poll_thread_func(void *, wait_result_t);
454 static void ifnet_poll_thread_cont(void *, wait_result_t);
455
456 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
457 classq_pkt_t *, boolean_t, boolean_t *);
458
459 static void ifp_src_route_copyout(struct ifnet *, struct route *);
460 static void ifp_src_route_copyin(struct ifnet *, struct route *);
461 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
462 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
463
464 static errno_t if_mcasts_update_async(struct ifnet *);
465
466 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
467 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
468 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
469 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
470 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
471 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
472 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
473 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
474 static int sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS;
475 static int sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS;
476 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
477 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
478 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
479
480 struct chain_len_stats tx_chain_len_stats;
481 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
482
483 #if TEST_INPUT_THREAD_TERMINATION
484 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
485 #endif /* TEST_INPUT_THREAD_TERMINATION */
486
487 /* The following are protected by dlil_ifnet_lock */
488 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
489 static u_int32_t ifnet_detaching_cnt;
490 static boolean_t ifnet_detaching_embryonic;
491 static void *ifnet_delayed_run; /* wait channel for detaching thread */
492
493 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
494 &dlil_lck_attributes);
495
496 static uint32_t ifnet_flowhash_seed;
497
/*
 * Key material hashed by ifnet_calc_flowhash() to derive an
 * interface's flow hash (see ifce_flowhash in struct ifnet_fc_entry).
 */
struct ifnet_flowhash_key {
	char ifk_name[IFNAMSIZ];        /* interface name */
	uint32_t ifk_unit;              /* interface unit number */
	uint32_t ifk_flags;             /* interface flags */
	uint32_t ifk_eflags;            /* extended flags */
	uint32_t ifk_capabilities;      /* capabilities */
	uint32_t ifk_capenable;         /* enabled capabilities */
	uint32_t ifk_output_sched_model;        /* output scheduling model */
	uint32_t ifk_rand1;             /* presumably random salt — see ifnet_flowhash_seed; confirm */
	uint32_t ifk_rand2;             /* presumably random salt; confirm */
};
509
510 /* Flow control entry per interface */
511 struct ifnet_fc_entry {
512 RB_ENTRY(ifnet_fc_entry) ifce_entry;
513 u_int32_t ifce_flowhash;
514 struct ifnet *ifce_ifp;
515 };
516
517 static uint32_t ifnet_calc_flowhash(struct ifnet *);
518 static int ifce_cmp(const struct ifnet_fc_entry *,
519 const struct ifnet_fc_entry *);
520 static int ifnet_fc_add(struct ifnet *);
521 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
522 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
523
524 /* protected by ifnet_fc_lock */
525 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
526 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
527 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
528
529 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
530
531 extern void bpfdetach(struct ifnet *);
532 extern void proto_input_run(void);
533
534 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
535 u_int32_t flags);
536 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
537 u_int32_t flags);
538
539 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
540
541 #if CONFIG_MACF
542 #if !XNU_TARGET_OS_OSX
543 int dlil_lladdr_ckreq = 1;
544 #else /* XNU_TARGET_OS_OSX */
545 int dlil_lladdr_ckreq = 0;
546 #endif /* XNU_TARGET_OS_OSX */
547 #endif /* CONFIG_MACF */
548
549 #if DEBUG
550 int dlil_verbose = 1;
551 #else
552 int dlil_verbose = 0;
553 #endif /* DEBUG */
554 #if IFNET_INPUT_SANITY_CHK
555 /* sanity checking of input packet lists received */
556 static u_int32_t dlil_input_sanity_check = 0;
557 #endif /* IFNET_INPUT_SANITY_CHK */
558 /* rate limit debug messages */
559 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
560
561 SYSCTL_DECL(_net_link_generic_system);
562
563 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
564 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
565
566 #define IF_SNDQ_MINLEN 32
567 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
568 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
569 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
570 sysctl_sndq_maxlen, "I", "Default transmit queue max length");
571
572 #define IF_RCVQ_MINLEN 32
573 #define IF_RCVQ_MAXLEN 256
574 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
575 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
576 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
577 sysctl_rcvq_maxlen, "I", "Default receive queue max length");
578
579 /*
580 * Protect against possible memory starvation that may happen
581 * when the driver is pushing data faster than the AP can process.
582 *
583 * If at any point during DLIL input phase any of the input queues
584 * exceeds the burst limit, DLIL will start to trim the queue,
585 * by returning mbufs in the input queue to the cache from which
586 * the mbufs were originally allocated, starting from the oldest
587 * mbuf and continuing until the new limit (see below) is reached.
588 *
589 * In order to avoid a steplocked equilibrium, the trimming
590 * will continue PAST the burst limit, until the corresponding
591 * input queue is reduced to `if_rcvq_trim_pct' %.
592 *
593 * For example, if the input queue limit is 1024 packets,
594 * and the trim percentage (`if_rcvq_trim_pct') is 80 %,
595 * the trimming will continue until the queue contains 819 packets
596 * (1024 * 80 / 100 == 819).
597 *
598 * Setting the burst limit too low can hurt the throughput,
599 * while setting the burst limit too high can defeat the purpose.
600 */
601 #define IF_RCVQ_BURST_LIMIT_MIN 1024
602 #define IF_RCVQ_BURST_LIMIT_DEFAULT 8192
603 #define IF_RCVQ_BURST_LIMIT_MAX 32768
604 uint32_t if_rcvq_burst_limit = IF_RCVQ_BURST_LIMIT_DEFAULT;
605 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_burst_limit,
606 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_burst_limit, IF_RCVQ_BURST_LIMIT_DEFAULT,
607 sysctl_rcvq_burst_limit, "I", "Upper memory limit for inbound data");
608
609 #define IF_RCVQ_TRIM_PCT_MIN 20
610 #define IF_RCVQ_TRIM_PCT_DEFAULT 80
611 #define IF_RCVQ_TRIM_PCT_MAX 100
612 uint32_t if_rcvq_trim_pct = IF_RCVQ_TRIM_PCT_DEFAULT;
613 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_trim_pct,
614 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_trim_pct, IF_RCVQ_TRIM_PCT_DEFAULT,
615 sysctl_rcvq_trim_pct, "I",
616 "Percentage (0 - 100) of the queue limit to keep after detecting an overflow burst");
617
618 #define IF_RXPOLL_DECAY 2 /* ilog2 of EWMA decay rate (4) */
619 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
620 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
621 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
622 "ilog2 of EWMA decay rate of avg inbound packets");
623
624 #define IF_RXPOLL_MODE_HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10 ms */
625 #define IF_RXPOLL_MODE_HOLDTIME (1000ULL * 1000 * 1000) /* 1 sec */
626 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
627 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
628 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
629 IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
630 "Q", "input poll mode freeze time");
631
632 #define IF_RXPOLL_SAMPLETIME_MIN (1ULL * 1000 * 1000) /* 1 ms */
633 #define IF_RXPOLL_SAMPLETIME (10ULL * 1000 * 1000) /* 10 ms */
634 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
635 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
636 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
637 IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
638 "Q", "input poll sampling time");
639
640 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
641 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
642 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
643 IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
644 "Q", "input poll interval (time)");
645
646 #define IF_RXPOLL_INTERVAL_PKTS 0 /* 0 (disabled) */
647 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
648 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
649 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
650 IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
651
652 #define IF_RXPOLL_WLOWAT 10
653 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
654 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
655 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
656 IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
657 "I", "input poll wakeup low watermark");
658
659 #define IF_RXPOLL_WHIWAT 100
660 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
661 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
662 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
663 IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
664 "I", "input poll wakeup high watermark");
665
666 static u_int32_t if_rxpoll_max = 0; /* 0 (automatic) */
667 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
668 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
669 "max packets per poll call");
670
671 u_int32_t if_rxpoll = 1;
672 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
673 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
674 sysctl_rxpoll, "I", "enable opportunistic input polling");
675
676 #if TEST_INPUT_THREAD_TERMINATION
677 static u_int32_t if_input_thread_termination_spin = 0;
678 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
679 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
680 &if_input_thread_termination_spin, 0,
681 sysctl_input_thread_termination_spin,
682 "I", "input thread termination spin limit");
683 #endif /* TEST_INPUT_THREAD_TERMINATION */
684
685 static u_int32_t cur_dlil_input_threads = 0;
686 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
687 CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
688 "Current number of DLIL input threads");
689
690 #if IFNET_INPUT_SANITY_CHK
691 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
692 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
693 "Turn on sanity checking in DLIL input");
694 #endif /* IFNET_INPUT_SANITY_CHK */
695
696 static u_int32_t if_flowadv = 1;
697 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
698 CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
699 "enable flow-advisory mechanism");
700
701 static u_int32_t if_delaybased_queue = 1;
702 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
703 CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
704 "enable delay based dynamic queue sizing");
705
706 static uint64_t hwcksum_in_invalidated = 0;
707 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
708 hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
709 &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
710
711 uint32_t hwcksum_dbg = 0;
712 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
713 CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
714 "enable hardware cksum debugging");
715
716 u_int32_t ifnet_start_delayed = 0;
717 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
718 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
719 "number of times start was delayed");
720
721 u_int32_t ifnet_delay_start_disabled = 0;
722 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
723 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
724 "number of times start was delayed");
725
/*
 * Atomically bump ifnet_delay_start_disabled, the counter exported via
 * the net.link.generic.system.start_delay_disabled sysctl.
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
731
/* hwcksum_dbg_mode flag bits (settable via the sysctl below) */
#define HWCKSUM_DBG_PARTIAL_FORCED 0x1 /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ 0x2 /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED 0x10 /* forced finalize */
#define HWCKSUM_DBG_MASK \
	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ | \
	HWCKSUM_DBG_FINALIZE_FORCED)

/* current hardware-checksum debugging mode (bits above) */
static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

/* read-only counters exported while hwcksum debugging is active */
static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

/* global enable/disable knobs for hardware checksum offload */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

/* transmit mbuf chain length histogram */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

/* interface advisory/threshold notification controls */
static uint32_t threshold_notify = 1; /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2; /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

/* network API usage statistics, exported under net.api_stats */
struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

/* global input-path tunables */
unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1; /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
849
850 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)851 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
852 {
853 /*
854 * update filter count and route_generation ID to let TCP
855 * know it should reevalute doing TSO or not
856 */
857 if (filter_enable) {
858 OSAddAtomic(1, &ifp->if_flt_no_tso_count);
859 } else {
860 VERIFY(ifp->if_flt_no_tso_count != 0);
861 OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
862 }
863 routegenid_update();
864 }
865
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* derived enable bits, seeded from the default attach flags */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
884
#if (DEVELOPMENT || DEBUG)
/*
 * sysctl handler for net.link.generic.system.if_attach_nx.  Any bit
 * may be changed except the flowswitch transport netagent bit, which
 * cannot be toggled through this knob (returns ENOTSUP).
 */
static int
if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error;

	error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
	    &new_value, &changed);
	if (error != 0 || !changed) {
		return error;
	}
	/* refuse any attempt to flip the transport netagent bit */
	if (((new_value ^ if_attach_nx) &
	    IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
		return ENOTSUP;
	}
	if_attach_nx = new_value;
	return 0;
}

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
912
913 static int
914 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
915 {
916 #pragma unused(oidp, arg1, arg2)
917 unsigned int new_value;
918 int changed;
919 int error;
920
921 error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
922 sizeof(if_enable_fsw_transport_netagent),
923 &new_value, &changed);
924 if (error == 0 && changed != 0) {
925 if (new_value != 0 && new_value != 1) {
926 /* only allow 0 or 1 */
927 error = EINVAL;
928 } else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
929 /* netagent can be enabled/disabled */
930 if_enable_fsw_transport_netagent = new_value;
931 if (new_value == 0) {
932 kern_nexus_deregister_netagents();
933 } else {
934 kern_nexus_register_netagents();
935 }
936 } else {
937 /* netagent can't be enabled */
938 error = ENOTSUP;
939 }
940 }
941 return error;
942 }
943
944 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
945 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
946 0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
947 "enable flowswitch netagent");
948
949 static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);
950
951 #include <skywalk/os_skywalk_private.h>
952
953 boolean_t
ifnet_nx_noauto(ifnet_t ifp)954 ifnet_nx_noauto(ifnet_t ifp)
955 {
956 return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
957 }
958
959 boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)960 ifnet_nx_noauto_flowswitch(ifnet_t ifp)
961 {
962 return ifnet_is_low_latency(ifp);
963 }
964
965 boolean_t
ifnet_is_low_latency(ifnet_t ifp)966 ifnet_is_low_latency(ifnet_t ifp)
967 {
968 return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
969 }
970
/*
 * Should this interface be plumbed with the netif compat layer?
 * Requires IF_ATTACH_NX_NETIF_COMPAT in if_attach_nx.  On non-macOS
 * targets the Wi-Fi "ap" (Access Point) interface is plumbed only
 * when if_netif_all is set; all other interfaces get the compat layer.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
995
996 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)997 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
998 {
999 if (if_is_fsw_transport_netagent_enabled()) {
1000 /* check if netagent has been manually enabled for ipsec/utun */
1001 if (ifp->if_family == IFNET_FAMILY_IPSEC) {
1002 return ipsec_interface_needs_netagent(ifp);
1003 } else if (ifp->if_family == IFNET_FAMILY_UTUN) {
1004 return utun_interface_needs_netagent(ifp);
1005 }
1006
1007 /* check ifnet no auto nexus override */
1008 if (ifnet_nx_noauto(ifp)) {
1009 return FALSE;
1010 }
1011
1012 /* check global if_attach_nx configuration */
1013 switch (ifp->if_family) {
1014 case IFNET_FAMILY_CELLULAR:
1015 case IFNET_FAMILY_ETHERNET:
1016 if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
1017 return TRUE;
1018 }
1019 break;
1020 default:
1021 break;
1022 }
1023 }
1024 return FALSE;
1025 }
1026
1027 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)1028 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
1029 {
1030 #pragma unused(ifp)
1031 if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
1032 return TRUE;
1033 }
1034 return FALSE;
1035 }
1036
1037 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)1038 ifnet_needs_netif_netagent(ifnet_t ifp)
1039 {
1040 #pragma unused(ifp)
1041 return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1042 }
1043
/*
 * Detach and free one nexus provider instance.  Detaches the device
 * port first (when a non-null device uuid is supplied), then frees
 * the instance.  Returns TRUE if an instance was present and torn
 * down, FALSE if there was nothing to do.  Errors from the nexus
 * calls are logged but not propagated.
 */
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t err;

	if (instance == NULL || uuid_is_null(instance)) {
		/* no instance to tear down */
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}
1070
/*
 * Tear down the nexus state described by (provider, instance, device):
 * the instance (and its device port) first, then the provider
 * registration.  Returns TRUE if anything was detached or
 * deregistered; errors are logged but otherwise ignored.
 */
static boolean_t
dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
    uuid_t device)
{
	boolean_t detached = FALSE;
	nexus_controller_t controller = kern_nexus_shared_controller();
	int err;

	if (dlil_detach_nexus_instance(controller, func_str, instance,
	    device)) {
		detached = TRUE;
	}
	if (provider != NULL && !uuid_is_null(provider)) {
		/* a provider existed: report TRUE even if deregister fails */
		detached = TRUE;
		err = kern_nexus_controller_deregister_provider(controller,
		    provider);
		if (err != 0) {
			DLIL_PRINTF("%s deregister_provider %d\n",
			    func_str, err);
		}
	}
	return detached;
}
1094
/*
 * Register a nexus provider of the given type (netif or flowswitch),
 * named "com.apple.<type>.<ifname>", and allocate one instance of it.
 * On success both *provider and *instance are filled in; on instance
 * allocation failure the freshly registered provider is deregistered
 * again.  Returns 0 or an errno-style error.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t dom_prov;
	errno_t err;
	nexus_name_t provider_name;
	const char *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
	/* NOTE: success also falls through here with err == 0 */
failed:
	return err;
}
1144
/*
 * Create a netif nexus provider/instance for `ifp' and attach the
 * interface to it, recording the resulting uuids in *netif_nx.
 * No-op (FALSE) if a nexus is already attached (IFCAP_SKYWALK set).
 * On ifattach failure the partially created provider/instance are
 * cleaned up.  Returns TRUE on success.
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	/* attr is destroyed on both success and failure paths above */
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1198
1199 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1200 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1201 {
1202 if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1203 IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
1204 goto failed;
1205 }
1206 switch (ifp->if_type) {
1207 case IFT_CELLULAR:
1208 case IFT_ETHER:
1209 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1210 /* don't auto-attach */
1211 goto failed;
1212 }
1213 break;
1214 default:
1215 /* don't auto-attach */
1216 goto failed;
1217 }
1218 return dlil_attach_netif_nexus_common(ifp, netif_nx);
1219
1220 failed:
1221 return FALSE;
1222 }
1223
1224 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1225 dlil_is_native_netif_nexus(ifnet_t ifp)
1226 {
1227 return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1228 }
1229
/* Tear down the netif nexus recorded in `nexus_netif'. */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1237
1238 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1239 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1240 {
1241 struct ifreq ifr;
1242 int error;
1243
1244 bzero(&ifr, sizeof(ifr));
1245 error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1246 if (error == 0) {
1247 *ifdm_p = ifr.ifr_devmtu;
1248 }
1249 return error;
1250 }
1251
/*
 * On macOS, for native Skywalk drivers, grow *large_buf_size to match
 * the driver's advertised TSO MTU (or the software GSO MTU when TSO is
 * absent), capped at NX_FSW_MAX_LARGE_BUFSIZE.  No-op elsewhere.
 */
static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
{
#ifdef XNU_TARGET_OS_OSX
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;

	if (!dlil_is_native_netif_nexus(ifp)) {
		return;
	}
	/*
	 * Note that we are reading the real hwassist flags set by the driver
	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
	 * hasn't been called yet.
	 */
	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
		tso_v4_mtu = ifp->if_tso_v4_mtu;
	}
	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
		tso_v6_mtu = ifp->if_tso_v6_mtu;
	}
	/*
	 * If the hardware supports TSO, adjust the large buf size to match the
	 * supported TSO MTU size.
	 */
	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
	} else {
		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
	}
	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
#else
#pragma unused(ifp, large_buf_size)
#endif /* XNU_TARGET_OS_OSX */
}
1287
1288 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1289 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1290 bool *use_multi_buflet, uint32_t *large_buf_size)
1291 {
1292 struct kern_pbufpool_memory_info rx_pp_info;
1293 struct kern_pbufpool_memory_info tx_pp_info;
1294 uint32_t if_max_mtu = 0;
1295 uint32_t drv_buf_size;
1296 struct ifdevmtu ifdm;
1297 int err;
1298
1299 /*
1300 * To perform intra-stack RX aggregation flowswitch needs to use
1301 * multi-buflet packet.
1302 */
1303 *use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1304
1305 *large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1306 /*
1307 * IP over Thunderbolt interface can deliver the largest IP packet,
1308 * but the driver advertises the MAX MTU as only 9K.
1309 */
1310 if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1311 if_max_mtu = IP_MAXPACKET;
1312 goto skip_mtu_ioctl;
1313 }
1314
1315 /* determine max mtu */
1316 bzero(&ifdm, sizeof(ifdm));
1317 err = dlil_siocgifdevmtu(ifp, &ifdm);
1318 if (__improbable(err != 0)) {
1319 DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1320 __func__, if_name(ifp));
1321 /* use default flowswitch buffer size */
1322 if_max_mtu = NX_FSW_BUFSIZE;
1323 } else {
1324 DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1325 ifdm.ifdm_max, ifdm.ifdm_current);
1326 /* rdar://problem/44589731 */
1327 if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1328 }
1329
1330 skip_mtu_ioctl:
1331 if (if_max_mtu == 0) {
1332 DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1333 __func__, if_name(ifp));
1334 return EINVAL;
1335 }
1336 if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1337 DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1338 "max bufsize(%d)\n", __func__,
1339 if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1340 return EINVAL;
1341 }
1342
1343 /*
1344 * for skywalk native driver, consult the driver packet pool also.
1345 */
1346 if (dlil_is_native_netif_nexus(ifp)) {
1347 err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1348 &tx_pp_info);
1349 if (err != 0) {
1350 DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1351 __func__, if_name(ifp));
1352 return ENXIO;
1353 }
1354 drv_buf_size = tx_pp_info.kpm_bufsize *
1355 tx_pp_info.kpm_max_frags;
1356 if (if_max_mtu > drv_buf_size) {
1357 DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1358 "tx %d * %d) can't support max mtu(%d)\n", __func__,
1359 if_name(ifp), rx_pp_info.kpm_bufsize,
1360 rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1361 tx_pp_info.kpm_max_frags, if_max_mtu);
1362 return EINVAL;
1363 }
1364 } else {
1365 drv_buf_size = if_max_mtu;
1366 }
1367
1368 if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1369 _CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1370 *use_multi_buflet = true;
1371 /* default flowswitch buffer size */
1372 *buf_size = NX_FSW_BUFSIZE;
1373 *large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1374 } else {
1375 *buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1376 }
1377 _dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1378 ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1379 if (*buf_size >= *large_buf_size) {
1380 *large_buf_size = 0;
1381 }
1382 return 0;
1383 }
1384
/*
 * Create and attach a flowswitch nexus on top of the interface's
 * netif, recording the resulting uuids in *nexus_fsw.  Declines
 * (FALSE) for no-auto / low-latency / vmnet interfaces, interfaces
 * without a netif plumbed (no IFCAP_SKYWALK), or when the global
 * if_attach_nx configuration disables flowswitch auto-attach.
 * Buffer sizing is derived via _dlil_get_flowswitch_buffer_size().
 * All failure paths log and clean up; returns TRUE on success.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err = 0;
	uuid_t netif;
	uint32_t buf_size = 0;
	uint32_t large_buf_size = 0;
	bool multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	/* err == 0 here means we declined rather than failed */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1483
1484 static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)1485 dlil_attach_flowswitch_nexus(ifnet_t ifp)
1486 {
1487 boolean_t attached;
1488 if_nexus_flowswitch nexus_fsw;
1489
1490 #if (DEVELOPMENT || DEBUG)
1491 if (skywalk_netif_direct_allowed(if_name(ifp))) {
1492 DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
1493 return FALSE;
1494 }
1495 #endif /* (DEVELOPMENT || DEBUG) */
1496
1497 /*
1498 * flowswitch attachment is not supported for interface using the
1499 * legacy model (IFNET_INIT_LEGACY)
1500 */
1501 if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
1502 DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
1503 if_name(ifp));
1504 return FALSE;
1505 }
1506
1507 if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
1508 /* it's already attached */
1509 return FALSE;
1510 }
1511 bzero(&nexus_fsw, sizeof(nexus_fsw));
1512 attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
1513 if (attached) {
1514 ifnet_lock_exclusive(ifp);
1515 if (!IF_FULLY_ATTACHED(ifp)) {
1516 /* interface is going away */
1517 attached = FALSE;
1518 } else {
1519 ifp->if_nx_flowswitch = nexus_fsw;
1520 }
1521 ifnet_lock_done(ifp);
1522 if (!attached) {
1523 /* clean up flowswitch nexus */
1524 dlil_detach_flowswitch_nexus(&nexus_fsw);
1525 }
1526 }
1527 return attached;
1528 }
1529
/* Tear down the flowswitch nexus recorded in `nexus_fsw'. */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1537
1538 __attribute__((noinline))
1539 static void
dlil_netif_detach_notify(ifnet_t ifp)1540 dlil_netif_detach_notify(ifnet_t ifp)
1541 {
1542 ifnet_detach_notify_cb_t notify = NULL;
1543 void *arg = NULL;
1544
1545 ifnet_get_detach_notify(ifp, ¬ify, &arg);
1546 if (notify == NULL) {
1547 DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1548 return;
1549 }
1550 (*notify)(arg);
1551 }
1552
/*
 * Quiesce data movement on `ifp' and tear down both its flowswitch
 * and netif nexuses (flowswitch first, since it sits on top of the
 * netif).  The recorded nexus state is zeroed after each teardown.
 * Data movement is resumed before returning.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		/* either all fsw uuids are set, or none are */
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		/* either all netif uuids are set, or none are */
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1584
1585 boolean_t
ifnet_add_netagent(ifnet_t ifp)1586 ifnet_add_netagent(ifnet_t ifp)
1587 {
1588 int error;
1589
1590 error = kern_nexus_interface_add_netagent(ifp);
1591 os_log(OS_LOG_DEFAULT,
1592 "kern_nexus_interface_add_netagent(%s) returned %d",
1593 ifp->if_xname, error);
1594 return error == 0;
1595 }
1596
1597 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1598 ifnet_remove_netagent(ifnet_t ifp)
1599 {
1600 int error;
1601
1602 error = kern_nexus_interface_remove_netagent(ifp);
1603 os_log(OS_LOG_DEFAULT,
1604 "kern_nexus_interface_remove_netagent(%s) returned %d",
1605 ifp->if_xname, error);
1606 return error == 0;
1607 }
1608
1609 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1610 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1611 {
1612 if (!IF_FULLY_ATTACHED(ifp)) {
1613 return FALSE;
1614 }
1615 return dlil_attach_flowswitch_nexus(ifp);
1616 }
1617
1618 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1619 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1620 {
1621 if_nexus_flowswitch nexus_fsw;
1622
1623 ifnet_lock_exclusive(ifp);
1624 nexus_fsw = ifp->if_nx_flowswitch;
1625 bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1626 ifnet_lock_done(ifp);
1627 return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1628 nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1629 }
1630
1631 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1632 ifnet_attach_netif_nexus(ifnet_t ifp)
1633 {
1634 boolean_t nexus_attached;
1635 if_nexus_netif nexus_netif;
1636
1637 if (!IF_FULLY_ATTACHED(ifp)) {
1638 return FALSE;
1639 }
1640 nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1641 if (nexus_attached) {
1642 ifnet_lock_exclusive(ifp);
1643 ifp->if_nx_netif = nexus_netif;
1644 ifnet_lock_done(ifp);
1645 }
1646 return nexus_attached;
1647 }
1648
1649 boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)1650 ifnet_detach_netif_nexus(ifnet_t ifp)
1651 {
1652 if_nexus_netif nexus_netif;
1653
1654 ifnet_lock_exclusive(ifp);
1655 nexus_netif = ifp->if_nx_netif;
1656 bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
1657 ifnet_lock_done(ifp);
1658
1659 return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
1660 nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
1661 }
1662
1663 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1664 ifnet_attach_native_flowswitch(ifnet_t ifp)
1665 {
1666 if (!dlil_is_native_netif_nexus(ifp)) {
1667 /* not a native netif */
1668 return;
1669 }
1670 ifnet_attach_flowswitch_nexus(ifp);
1671 }
1672
/*
 * Install (or clear, with cb == NULL) the flowswitch RX callback.
 * Sleeps until no reader holds a reference to the current callback,
 * so the previous callback/arg pair can never be invoked after this
 * returns.  Always returns 0.
 */
int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	while (ifp->if_fsw_rx_cb_ref > 0) {
		/* wait for readers to drop their references (see _release) */
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1688
/*
 * Fetch the flowswitch RX callback and its argument, taking a
 * reference that must be dropped via
 * ifnet_release_flowswitch_rx_callback().  Returns ENOENT when no
 * callback is installed.
 */
int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * This is for avoiding the unnecessary lock acquire for interfaces
	 * not used by a redirect interface.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* re-check under the lock; the callback may have been cleared */
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1710
/*
 * Drop a reference taken by ifnet_get_flowswitch_rx_callback();
 * wakes any setter waiting for the reference count to reach zero.
 */
void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (--ifp->if_fsw_rx_cb_ref == 0) {
		wakeup(&ifp->if_fsw_rx_cb_ref);
	}
	lck_mtx_unlock(&ifp->if_delegate_lock);
}
1720
/*
 * Install (or clear, with parent == NULL) the delegate parent of
 * `difp'.  Sleeps until no reader holds a reference to the current
 * parent, so a stale parent pointer is never used after this returns.
 * Always returns 0.
 */
int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	while (difp->if_delegate_parent_ref > 0) {
		/* wait for readers to drop their references (see _release) */
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1735
/*
 * Fetch the delegate parent of `difp', taking a reference that must
 * be dropped via ifnet_release_delegate_parent().  Returns ENOENT
 * when no parent is set.
 */
int
ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (difp->if_delegate_parent == NULL) {
		lck_mtx_unlock(&difp->if_delegate_lock);
		return ENOENT;
	}
	*parentp = difp->if_delegate_parent;
	difp->if_delegate_parent_ref++;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1749
/*
 * Drop a reference taken by ifnet_get_delegate_parent(); wakes any
 * setter waiting for the reference count to reach zero.
 */
void
ifnet_release_delegate_parent(ifnet_t difp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (--difp->if_delegate_parent_ref == 0) {
		wakeup(&difp->if_delegate_parent_ref);
	}
	lck_mtx_unlock(&difp->if_delegate_lock);
}
1759
/*
 * Set the detach-notify callback/arg; caller must hold the ifnet
 * lock exclusively.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}
1768
1769 __attribute__((noinline))
1770 void
ifnet_get_detach_notify_locked(ifnet_t ifp,ifnet_detach_notify_cb_t * notifyp,void ** argp)1771 ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
1772 {
1773 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
1774 *notifyp = ifp->if_detach_notify;
1775 *argp = ifp->if_detach_notify_arg;
1776 }
1777
1778 __attribute__((noinline))
1779 void
ifnet_set_detach_notify(ifnet_t ifp,ifnet_detach_notify_cb_t notify,void * arg)1780 ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
1781 {
1782 ifnet_lock_exclusive(ifp);
1783 ifnet_set_detach_notify_locked(ifp, notify, arg);
1784 ifnet_lock_done(ifp);
1785 }
1786
1787 __attribute__((noinline))
1788 void
ifnet_get_detach_notify(ifnet_t ifp,ifnet_detach_notify_cb_t * notifyp,void ** argp)1789 ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
1790 {
1791 ifnet_lock_exclusive(ifp);
1792 ifnet_get_detach_notify_locked(ifp, notifyp, argp);
1793 ifnet_lock_done(ifp);
1794 }
1795 #endif /* SKYWALK */
1796
/*
 * Sanity-check an inbound mbuf before DLIL processing: it must carry a
 * packet header (MBUF_PKTHDR) and its recorded receive interface must
 * match `ifp'.  Loopback is exempt from the rcvif match, since lo_ifp
 * legitimately carries packets stamped with other interfaces.  Any
 * violation is fatal.
 */
#define DLIL_INPUT_CHECK(m, ifp) { \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m); \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) || \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) { \
		panic_plain("%s: invalid mbuf %p\n", __func__, m); \
		/* NOTREACHED */ \
	} \
}
1805
/*
 * Integer exponentially-weighted moving average:
 *
 *	old = old + (new - old) / 2^decay
 *
 * implemented as ((old << decay) - old + new) >> decay.  When `old' is
 * zero the average is seeded directly with `new'.
 */
#define DLIL_EWMA(old, new, decay) do { \
	u_int32_t _avg; \
	if ((_avg = (old)) > 0) \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else \
	        _avg = (new); \
	(old) = _avg; \
} while (0)
1814
#define MBPS (1ULL * 1000 * 1000)
#define GBPS (MBPS * 1000)

/*
 * Per-downlink-speed packet/byte watermarks used to tune opportunistic
 * receive polling; see dlil_rxpoll_set_params() callers.
 */
struct rxpoll_time_tbl {
	u_int64_t speed;                /* downlink speed */
	u_int32_t plowat;               /* packets low watermark */
	u_int32_t phiwat;               /* packets high watermark */
	u_int32_t blowat;               /* bytes low watermark */
	u_int32_t bhiwat;               /* bytes high watermark */
};

/* Ascending by speed; terminated by an all-zero sentinel entry. */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
1834
/*
 * `dlil_pending_thread_cnt' counts DLIL threads that are still starting
 * up; updates are serialized by `dlil_thread_sync_lock' and waiters
 * sleep on the counter's address (woken when it reaches zero).
 */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;
1838
/*
 * Note one more DLIL thread in the process of starting up.  Must be
 * called without `dlil_thread_sync_lock' held.
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1847
1848 static void
dlil_decr_pending_thread_count(void)1849 dlil_decr_pending_thread_count(void)
1850 {
1851 LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1852 lck_mtx_lock(&dlil_thread_sync_lock);
1853 VERIFY(dlil_pending_thread_cnt > 0);
1854 dlil_pending_thread_cnt--;
1855 if (dlil_pending_thread_cnt == 0) {
1856 wakeup(&dlil_pending_thread_cnt);
1857 }
1858 lck_mtx_unlock(&dlil_thread_sync_lock);
1859 }
1860
1861 int
proto_hash_value(u_int32_t protocol_family)1862 proto_hash_value(u_int32_t protocol_family)
1863 {
1864 /*
1865 * dlil_proto_unplumb_all() depends on the mapping between
1866 * the hash bucket index and the protocol family defined
1867 * here; future changes must be applied there as well.
1868 */
1869 switch (protocol_family) {
1870 case PF_INET:
1871 return 0;
1872 case PF_INET6:
1873 return 1;
1874 case PF_VLAN:
1875 return 2;
1876 case PF_UNSPEC:
1877 default:
1878 return 3;
1879 }
1880 }
1881
1882 /*
1883 * Caller must already be holding ifnet lock.
1884 */
1885 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1886 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1887 {
1888 struct if_proto *proto = NULL;
1889 u_int32_t i = proto_hash_value(protocol_family);
1890
1891 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1892
1893 if (ifp->if_proto_hash != NULL) {
1894 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1895 }
1896
1897 while (proto != NULL && proto->protocol_family != protocol_family) {
1898 proto = SLIST_NEXT(proto, next_hash);
1899 }
1900
1901 if (proto != NULL) {
1902 if_proto_ref(proto);
1903 }
1904
1905 return proto;
1906 }
1907
/* Take an additional reference on an attached protocol entry. */
static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}
1913
1914 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1915
/*
 * Drop a reference on an attached protocol entry; the final release
 * invokes the protocol's detached callback, purges its routes, posts
 * KEV_DL_PROTO_DETACHED, marks the interface down when no protocols
 * remain, and frees the entry.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		/* not the last reference; nothing more to do */
		return;
	}

	/* notify the protocol (KPI version determines which callback) */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1977
1978 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1979 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1980 {
1981 #if !MACH_ASSERT
1982 #pragma unused(ifp)
1983 #endif
1984 unsigned int type = 0;
1985 int ass = 1;
1986
1987 switch (what) {
1988 case IFNET_LCK_ASSERT_EXCLUSIVE:
1989 type = LCK_RW_ASSERT_EXCLUSIVE;
1990 break;
1991
1992 case IFNET_LCK_ASSERT_SHARED:
1993 type = LCK_RW_ASSERT_SHARED;
1994 break;
1995
1996 case IFNET_LCK_ASSERT_OWNED:
1997 type = LCK_RW_ASSERT_HELD;
1998 break;
1999
2000 case IFNET_LCK_ASSERT_NOTOWNED:
2001 /* nothing to do here for RW lock; bypass assert */
2002 ass = 0;
2003 break;
2004
2005 default:
2006 panic("bad ifnet assert type: %d", what);
2007 /* NOTREACHED */
2008 }
2009 if (ass) {
2010 LCK_RW_ASSERT(&ifp->if_lock, type);
2011 }
2012 }
2013
/* Acquire the per-interface RW lock for reading. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Acquire the per-interface RW lock for writing. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release the per-interface RW lock (either mode). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
2031
#if INET
/* Acquire the per-interface IPv4 data RW lock for reading. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Acquire the per-interface IPv4 data RW lock for writing. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the per-interface IPv4 data RW lock (either mode). */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif
2051
/* Acquire the per-interface IPv6 data RW lock for reading. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Acquire the per-interface IPv6 data RW lock for writing. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the per-interface IPv6 data RW lock (either mode). */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
2069
/* Acquire the global interface-list RW lock for reading. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Acquire the global interface-list RW lock for writing. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface-list RW lock (either mode). */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert that the global interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
2093
2094 /*
2095 * dlil_ifp_protolist
2096 * - get the list of protocols attached to the interface, or just the number
2097 * of attached protocols
2098 * - if the number returned is greater than 'list_count', truncation occurred
2099 *
2100 * Note:
2101 * - caller must already be holding ifnet lock.
2102 */
2103 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)2104 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
2105 u_int32_t list_count)
2106 {
2107 u_int32_t count = 0;
2108 int i;
2109
2110 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
2111
2112 if (ifp->if_proto_hash == NULL) {
2113 goto done;
2114 }
2115
2116 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
2117 struct if_proto *proto;
2118 SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
2119 if (list != NULL && count < list_count) {
2120 list[count] = proto->protocol_family;
2121 }
2122 count++;
2123 }
2124 }
2125 done:
2126 return count;
2127 }
2128
/*
 * Locking wrapper for dlil_ifp_protolist(): returns the number of
 * protocols attached to `ifp', copying up to `count' families into
 * `protolist'; a return value greater than `count' indicates truncation.
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
2137
/* Free a protocol list buffer handed out alongside if_get_protolist(). */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
2143
/*
 * Post a KEV_NETWORK_CLASS kernel event for `ifp'.
 *
 * If `event_data' is NULL a minimal net_event_data is built on the
 * stack; in either case the interface name/family/unit are filled in
 * before posting.  The interface generation count is normally bumped
 * (which triggers NECP client re-evaluation), except for frequent
 * low-impact DL events or when `suppress_generation' is set.
 * Returns the result of dlil_event_internal().
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = event_subclass;
	ev_msg.event_code = event_code;

	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	/* stamp the event with the originating interface's identity */
	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
2205
/*
 * Allocate the per-interface protocol statistics blocks (tcpstat_local,
 * udpstat_local, and the IPv4/IPv6 ECN stats).
 *
 * The TCP/UDP stats must be 64-bit aligned for atomic updates, so each
 * is carved out of an oversized zone element: the stats pointer is the
 * aligned base, and the word immediately preceding it stores the
 * original zone allocation address for use at free time.
 *
 * Returns 0 on success, EINVAL otherwise.  NOTE(review): if only one of
 * if_tcp_stat/if_udp_stat is already set on entry, `ret' stays EINVAL
 * and the cleanup path below frees the existing blocks — presumably
 * this function is only called once per interface at attach time;
 * confirm against callers.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* on failure, release anything partially allocated */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original zone address stashed below the aligned base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2291
/*
 * Reset all opportunistic-polling state on `ifp': cancel the poll
 * cycle, clear mode/flags/request counters, zero the poll statistics,
 * and clear the mode/sample hold and last-transition timestamps.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2310
/*
 * Create (or configure) the DLIL input thread for `inp'.
 *
 * Strategy selection:
 *  - ifp == NULL: the main input thread (dlil_init time);
 *  - legacy interface with RXPOLL: hybrid polling thread;
 *  - net_async or legacy interface: asynchronous per-interface thread;
 *  - otherwise (netif below, no hybrid polling): synchronous strategy,
 *    no dedicated thread is started and ENODEV is returned.
 *
 * On success the thread is started at depressed precedence and, when
 * `net_affinity' is set, tagged with a random affinity namespace so the
 * matching workloop/starter thread can be co-scheduled.  `*thfunc'
 * (when non-NULL) receives the chosen thread continuation, or NULL for
 * the synchronous case.  Failure to start a thread is fatal.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	/* receive queue for this input thread (plus loopback for main) */
	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no dedicated thread to start */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		/* importance 0: run the input thread at default urgency */
		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2452
2453 #if TEST_INPUT_THREAD_TERMINATION
/*
 * Sysctl handler for `if_input_thread_termination_spin' (debug builds
 * only): reads back the current spin count, and on write validates that
 * receive polling is enabled (ENXIO otherwise) before updating it.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* error, or read-only access: nothing more to do */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
2475 #endif /* TEST_INPUT_THREAD_TERMINATION */
2476
/*
 * Tear down and scrub a dlil_threading_info so it can be reused:
 * destroy its mutex and lock group, clear all state and statistics,
 * and verify via VERIFY() that no packets, affinity, or auxiliary
 * threads remain.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	/* the receive queue must already have been drained */
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2502
/*
 * Terminate the calling per-interface input thread: drain its receive
 * queue, signal DLIL_INPUT_TERMINATE_COMPLETE to the thread waiting on
 * dlth_flags, free the drained packets, drop the kernel_thread_start()
 * reference, and self-terminate.  Never used for the main input thread.
 * Does not return.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* detach all pending packets before acknowledging termination */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2550
2551 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2552 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2553 {
2554 thread_affinity_policy_data_t policy;
2555
2556 bzero(&policy, sizeof(policy));
2557 policy.affinity_tag = tag;
2558 return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2559 (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2560 }
2561
2562 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Event handler invoked when the set of active content-filter
 * subsystems changes: the flowswitch transport netagent is enabled only
 * while no filters (other than the PF private proxy) are active.  When
 * the setting flips, netagents are re-registered; when filters remain
 * active, NECP clients are re-evaluated instead.
 */
static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,
    enum net_filter_event_subsystems state)
{
	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
		if_enable_fsw_transport_netagent = 1;
	} else {
		if_enable_fsw_transport_netagent = 0;
	}
	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
		kern_nexus_update_netagents();
	} else if (!if_enable_fsw_transport_netagent) {
		necp_update_all_clients();
	}
}
2579 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2580
2581 void
dlil_init(void)2582 dlil_init(void)
2583 {
2584 thread_t thread = THREAD_NULL;
2585
2586 /*
2587 * The following fields must be 64-bit aligned for atomic operations.
2588 */
2589 IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2590 IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2591 IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2592 IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2593 IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2594 IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2595 IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2596 IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2597 IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2598 IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2599 IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2600 IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2601 IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2602 IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2603 IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2604
2605 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2606 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2607 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2608 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2609 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2610 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2611 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2612 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2613 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2614 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2615 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2616 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2617 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2618 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2619 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2620
2621 /*
2622 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
2623 */
2624 _CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
2625 _CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
2626 _CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
2627 _CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
2628 _CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
2629 _CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
2630 _CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
2631 _CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
2632 _CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
2633 _CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
2634 _CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
2635 _CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
2636 _CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
2637 _CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
2638
2639 /*
2640 * ... as well as the mbuf checksum flags counterparts.
2641 */
2642 _CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
2643 _CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
2644 _CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
2645 _CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
2646 _CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
2647 _CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
2648 _CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
2649 _CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
2650 _CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
2651 _CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
2652 _CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
2653
2654 /*
2655 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
2656 */
2657 _CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
2658 _CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
2659
2660 _CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
2661 _CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
2662 _CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
2663 _CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
2664
2665 _CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
2666 _CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
2667 _CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
2668
2669 _CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
2670 _CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
2671 _CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
2672 _CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
2673 _CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
2674 _CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
2675 _CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
2676 _CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
2677 _CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
2678 _CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
2679 _CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
2680 _CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
2681 _CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
2682 _CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
2683 _CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
2684 _CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
2685 _CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
2686 _CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);
2687
2688 _CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
2689 _CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
2690 _CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
2691 _CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
2692 _CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
2693 _CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
2694 _CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
2695 _CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
2696 _CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
2697 _CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
2698 _CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);
2699
2700 _CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
2701 _CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
2702
2703 PE_parse_boot_argn("net_affinity", &net_affinity,
2704 sizeof(net_affinity));
2705
2706 PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));
2707
2708 PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));
2709
2710 PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));
2711
2712 PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));
2713
2714 VERIFY(dlil_pending_thread_cnt == 0);
2715 #if SKYWALK
2716 boolean_t pe_enable_fsw_transport_netagent = FALSE;
2717 boolean_t pe_disable_fsw_transport_netagent = FALSE;
2718 boolean_t enable_fsw_netagent =
2719 (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
2720 (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
2721
2722 /*
2723 * Check the device tree to see if Skywalk netagent has been explicitly
2724 * enabled or disabled. This can be overridden via if_attach_nx below.
2725 * Note that the property is a 0-length key, and so checking for the
2726 * presence itself is enough (no need to check for the actual value of
2727 * the retrieved variable.)
2728 */
2729 pe_enable_fsw_transport_netagent =
2730 PE_get_default("kern.skywalk_netagent_enable",
2731 &pe_enable_fsw_transport_netagent,
2732 sizeof(pe_enable_fsw_transport_netagent));
2733 pe_disable_fsw_transport_netagent =
2734 PE_get_default("kern.skywalk_netagent_disable",
2735 &pe_disable_fsw_transport_netagent,
2736 sizeof(pe_disable_fsw_transport_netagent));
2737
2738 /*
2739 * These two are mutually exclusive, i.e. they both can be absent,
2740 * but only one can be present at a time, and so we assert to make
2741 * sure it is correct.
2742 */
2743 VERIFY((!pe_enable_fsw_transport_netagent &&
2744 !pe_disable_fsw_transport_netagent) ||
2745 (pe_enable_fsw_transport_netagent ^
2746 pe_disable_fsw_transport_netagent));
2747
2748 if (pe_enable_fsw_transport_netagent) {
2749 kprintf("SK: netagent is enabled via an override for "
2750 "this platform\n");
2751 if_attach_nx = SKYWALK_NETWORKING_ENABLED;
2752 } else if (pe_disable_fsw_transport_netagent) {
2753 kprintf("SK: netagent is disabled via an override for "
2754 "this platform\n");
2755 if_attach_nx = SKYWALK_NETWORKING_DISABLED;
2756 } else {
2757 kprintf("SK: netagent is %s by default for this platform\n",
2758 (enable_fsw_netagent ? "enabled" : "disabled"));
2759 if_attach_nx = IF_ATTACH_NX_DEFAULT;
2760 }
2761
2762 /*
2763 * Now see if there's a boot-arg override.
2764 */
2765 (void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
2766 sizeof(if_attach_nx));
2767 if_enable_fsw_transport_netagent =
2768 ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
2769
2770 if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);
2771
2772 if (pe_disable_fsw_transport_netagent &&
2773 if_enable_fsw_transport_netagent) {
2774 kprintf("SK: netagent is force-enabled\n");
2775 } else if (!pe_disable_fsw_transport_netagent &&
2776 !if_enable_fsw_transport_netagent) {
2777 kprintf("SK: netagent is force-disabled\n");
2778 }
2779 #ifdef XNU_TARGET_OS_OSX
2780 if (if_enable_fsw_transport_netagent) {
2781 net_filter_event_register(dlil_filter_event);
2782 }
2783 #endif /* XNU_TARGET_OS_OSX */
2784
2785 #if (DEVELOPMENT || DEBUG)
2786 (void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
2787 &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
2788 #endif /* (DEVELOPMENT || DEBUG) */
2789
2790 #endif /* SKYWALK */
2791 dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
2792 sizeof(struct dlil_ifnet_dbg);
2793 /* Enforce 64-bit alignment for dlil_ifnet structure */
2794 dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
2795 dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
2796 dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);
2797
2798 dlif_tcpstat_size = sizeof(struct tcpstat_local);
2799 /* Enforce 64-bit alignment for tcpstat_local structure */
2800 dlif_tcpstat_bufsize =
2801 dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
2802 dlif_tcpstat_bufsize = (uint32_t)
2803 P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
2804 dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
2805 dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);
2806
2807 dlif_udpstat_size = sizeof(struct udpstat_local);
2808 /* Enforce 64-bit alignment for udpstat_local structure */
2809 dlif_udpstat_bufsize =
2810 dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
2811 dlif_udpstat_bufsize = (uint32_t)
2812 P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
2813 dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
2814 dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);
2815
2816 eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);
2817
2818 TAILQ_INIT(&dlil_ifnet_head);
2819 TAILQ_INIT(&ifnet_head);
2820 TAILQ_INIT(&ifnet_detaching_head);
2821 TAILQ_INIT(&ifnet_ordered_head);
2822
2823 /* Initialize interface address subsystem */
2824 ifa_init();
2825
2826 #if PF
2827 /* Initialize the packet filter */
2828 pfinit();
2829 #endif /* PF */
2830
2831 /* Initialize queue algorithms */
2832 classq_init();
2833
2834 /* Initialize packet schedulers */
2835 pktsched_init();
2836
2837 /* Initialize flow advisory subsystem */
2838 flowadv_init();
2839
2840 /* Initialize the pktap virtual interface */
2841 pktap_init();
2842
2843 /* Initialize the service class to dscp map */
2844 net_qos_map_init();
2845
2846 /* Initialize the interface low power mode event handler */
2847 if_low_power_evhdlr_init();
2848
2849 /* Initialize the interface offload port list subsystem */
2850 if_ports_used_init();
2851
2852 #if DEBUG || DEVELOPMENT
2853 /* Run self-tests */
2854 dlil_verify_sum16();
2855 #endif /* DEBUG || DEVELOPMENT */
2856
2857 /*
2858 * Create and start up the main DLIL input thread and the interface
2859 * detacher threads once everything is initialized.
2860 */
2861 dlil_incr_pending_thread_count();
2862 (void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);
2863
2864 /*
2865 * Create ifnet detacher thread.
2866 * When an interface gets detached, part of the detach processing
2867 * is delayed. The interface is added to delayed detach list
2868 * and this thread is woken up to call ifnet_detach_final
2869 * on these interfaces.
2870 */
2871 dlil_incr_pending_thread_count();
2872 if (kernel_thread_start(ifnet_detacher_thread_func,
2873 NULL, &thread) != KERN_SUCCESS) {
2874 panic_plain("%s: couldn't create detacher thread", __func__);
2875 /* NOTREACHED */
2876 }
2877 thread_deallocate(thread);
2878
2879 /*
2880 * Wait for the created kernel threads for dlil to get
2881 * scheduled and run at least once before we proceed
2882 */
2883 lck_mtx_lock(&dlil_thread_sync_lock);
2884 while (dlil_pending_thread_cnt != 0) {
2885 DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
2886 "threads to get scheduled at least once.\n", __func__);
2887 (void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
2888 (PZERO - 1), __func__, NULL);
2889 LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
2890 }
2891 lck_mtx_unlock(&dlil_thread_sync_lock);
2892 DLIL_PRINTF("%s: All the created dlil kernel threads have been "
2893 "scheduled at least once. Proceeding.\n", __func__);
2894 }
2895
2896 static void
if_flt_monitor_busy(struct ifnet * ifp)2897 if_flt_monitor_busy(struct ifnet *ifp)
2898 {
2899 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2900
2901 ++ifp->if_flt_busy;
2902 VERIFY(ifp->if_flt_busy != 0);
2903 }
2904
/*
 * Drop a busy reference on the interface's filter list; a thin alias of
 * if_flt_monitor_leave().  Caller must hold if_flt_lock (asserted in
 * if_flt_monitor_leave()).
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2910
/*
 * Enter the filter monitor: sleep until no other thread is busy with
 * the interface's filter list, then mark it busy on our behalf.
 * Caller must hold if_flt_lock; msleep() drops and reacquires it while
 * waiting, so the busy test is re-evaluated after each wakeup.
 */
static void
if_flt_monitor_enter(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	while (ifp->if_flt_busy) {
		++ifp->if_flt_waiters;
		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
		    (PZERO - 1), "if_flt_monitor", NULL);
	}
	if_flt_monitor_busy(ifp);
}
2923
/*
 * Leave the filter monitor: drop one busy reference on the interface's
 * filter list and, when the last reference goes away, wake up every
 * thread blocked in if_flt_monitor_enter().  Caller must hold
 * if_flt_lock.
 */
static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_flt_busy != 0);
	--ifp->if_flt_busy;

	/* last one out wakes all waiters (waiters counter is reset here) */
	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
2937
/*
 * Attach the interface filter described by if_filter to ifp and return
 * a reference to the newly allocated filter via filter_ref.
 *
 * Returns 0 on success, or ENXIO when ifp is not on the global
 * interface list or is no longer fully attached.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* On success this takes an IO refcnt; released before return below */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	/* the detached callback is installed unconditionally */
	filter->filt_detached = if_filter->iff_detached;

	/* serialize against other filter-list walkers via the monitor */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	/* global network API statistics accounting */
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* per-interface count of non-OS (external) filters */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the IO refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
3028
/*
 * Detach and destroy an interface filter.
 *
 * detached == 0: explicit detach.  The filter is searched for across
 * all attached interfaces; on a match it is marked skipped, unlinked
 * from the interface's filter list under the filter monitor, and
 * destroyed.  Returns EINVAL if the filter reference is not found.
 *
 * detached != 0: implicit detach from ifnet_detach_final(); the caller
 * has already emptied if_flt_head, so only the counters are adjusted
 * before the filter is destroyed.
 */
static int
dlil_detach_filter_internal(interface_filter_t filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance. Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1; /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				/* undo the attach-time non-OS accounting */
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevaluate doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away. Make sure to adjust the counters
		 * in this case. We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevaluate doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	/* undo the attach-time global statistics accounting */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* note: unreachable after destroy since filter was set to NULL */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
3149
3150 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)3151 dlil_detach_filter(interface_filter_t filter)
3152 {
3153 if (filter == NULL) {
3154 return;
3155 }
3156 dlil_detach_filter_internal(filter, 0);
3157 }
3158
3159 __private_extern__ boolean_t
dlil_has_ip_filter(void)3160 dlil_has_ip_filter(void)
3161 {
3162 boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
3163
3164 VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
3165
3166 DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
3167 return has_filter;
3168 }
3169
3170 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)3171 dlil_has_if_filter(struct ifnet *ifp)
3172 {
3173 boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
3174 DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
3175 return has_filter;
3176 }
3177
3178 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)3179 dlil_input_wakeup(struct dlil_threading_info *inp)
3180 {
3181 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3182
3183 inp->dlth_flags |= DLIL_INPUT_WAITING;
3184 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
3185 inp->dlth_wtot++;
3186 wakeup_one((caddr_t)&inp->dlth_flags);
3187 }
3188 }
3189
/*
 * Startup half of the main DLIL input thread.  Performs sanity checks,
 * publishes the embryonic state, wakes itself once so the continuation
 * runs at least once (clearing the pending-thread count), then blocks
 * into dlil_main_input_thread_cont() for all further work.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait channel before advertising the embryonic state */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3212
3213 /*
3214 * Main input thread:
3215 *
3216 * a) handles all inbound packets for lo0
3217 * b) handles all inbound packets for interfaces with no dedicated
3218 * input thread (e.g. anything but Ethernet/PDP or those that support
3219 * opportunistic polling.)
3220 * c) protocol registrations
3221 * d) packet injections
3222 */
3223 __attribute__((noreturn))
3224 static void
dlil_main_input_thread_cont(void * v,wait_result_t wres)3225 dlil_main_input_thread_cont(void *v, wait_result_t wres)
3226 {
3227 struct dlil_main_threading_info *inpm = v;
3228 struct dlil_threading_info *inp = v;
3229
3230 /* main input thread is uninterruptible */
3231 VERIFY(wres != THREAD_INTERRUPTED);
3232 lck_mtx_lock_spin(&inp->dlth_lock);
3233 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
3234 DLIL_INPUT_RUNNING)));
3235 inp->dlth_flags |= DLIL_INPUT_RUNNING;
3236
3237 while (1) {
3238 struct mbuf *m = NULL, *m_loop = NULL;
3239 u_int32_t m_cnt, m_cnt_loop;
3240 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3241 boolean_t proto_req;
3242 boolean_t embryonic;
3243
3244 inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3245
3246 if (__improbable(embryonic =
3247 (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3248 inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3249 }
3250
3251 proto_req = (inp->dlth_flags &
3252 (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
3253
3254 /* Packets for non-dedicated interfaces other than lo0 */
3255 m_cnt = qlen(&inp->dlth_pkts);
3256 _getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3257 m = pkt.cp_mbuf;
3258
3259 /* Packets exclusive to lo0 */
3260 m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
3261 _getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
3262 m_loop = pkt.cp_mbuf;
3263
3264 inp->dlth_wtot = 0;
3265
3266 lck_mtx_unlock(&inp->dlth_lock);
3267
3268 if (__improbable(embryonic)) {
3269 dlil_decr_pending_thread_count();
3270 }
3271
3272 /*
3273 * NOTE warning %%% attention !!!!
3274 * We should think about putting some thread starvation
3275 * safeguards if we deal with long chains of packets.
3276 */
3277 if (__probable(m_loop != NULL)) {
3278 dlil_input_packet_list_extended(lo_ifp, m_loop,
3279 m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
3280 }
3281
3282 if (__probable(m != NULL)) {
3283 dlil_input_packet_list_extended(NULL, m,
3284 m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
3285 }
3286
3287 if (__improbable(proto_req)) {
3288 proto_input_run();
3289 }
3290
3291 lck_mtx_lock_spin(&inp->dlth_lock);
3292 VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3293 /* main input thread cannot be terminated */
3294 VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
3295 if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
3296 break;
3297 }
3298 }
3299
3300 inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3301 (void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3302 lck_mtx_unlock(&inp->dlth_lock);
3303 (void) thread_block_parameter(dlil_main_input_thread_cont, inp);
3304
3305 VERIFY(0); /* we should never get here */
3306 /* NOTREACHED */
3307 __builtin_unreachable();
3308 }
3309
3310 /*
3311 * Input thread for interfaces with legacy input model.
3312 */
3313 __attribute__((noreturn))
3314 static void
dlil_input_thread_func(void * v,wait_result_t w)3315 dlil_input_thread_func(void *v, wait_result_t w)
3316 {
3317 #pragma unused(w)
3318 char thread_name[MAXTHREADNAMESIZE];
3319 struct dlil_threading_info *inp = v;
3320 struct ifnet *ifp = inp->dlth_ifp;
3321
3322 VERIFY(inp != dlil_main_input_thread);
3323 VERIFY(ifp != NULL);
3324 VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
3325 !(ifp->if_xflags & IFXF_LEGACY));
3326 VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
3327 !(ifp->if_xflags & IFXF_LEGACY));
3328 VERIFY(current_thread() == inp->dlth_thread);
3329
3330 /* construct the name for this thread, and then apply it */
3331 bzero(thread_name, sizeof(thread_name));
3332 (void) snprintf(thread_name, sizeof(thread_name),
3333 "dlil_input_%s", ifp->if_xname);
3334 thread_set_thread_name(inp->dlth_thread, thread_name);
3335
3336 lck_mtx_lock(&inp->dlth_lock);
3337 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3338 (void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3339 inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3340 /* wake up once to get out of embryonic state */
3341 dlil_input_wakeup(inp);
3342 lck_mtx_unlock(&inp->dlth_lock);
3343 (void) thread_block_parameter(dlil_input_thread_cont, inp);
3344 /* NOTREACHED */
3345 __builtin_unreachable();
3346 }
3347
/*
 * Continuation routine for a per-interface legacy input thread.
 * Drains the thread's packet queue, hands the chain to
 * dlil_input_packet_list_extended(), then either blocks waiting for
 * more work or terminates when DLIL_INPUT_TERMINATE is set (or the
 * wait was interrupted).
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation? */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while processing the dequeued chain */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* let the spawner know this thread has run once */
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* sleep only if no new work flag was raised meanwhile */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3451
3452 /*
3453 * Input thread for interfaces with opportunistic polling input model.
3454 */
3455 __attribute__((noreturn))
3456 static void
dlil_rxpoll_input_thread_func(void * v,wait_result_t w)3457 dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
3458 {
3459 #pragma unused(w)
3460 char thread_name[MAXTHREADNAMESIZE];
3461 struct dlil_threading_info *inp = v;
3462 struct ifnet *ifp = inp->dlth_ifp;
3463
3464 VERIFY(inp != dlil_main_input_thread);
3465 VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
3466 (ifp->if_xflags & IFXF_LEGACY));
3467 VERIFY(current_thread() == inp->dlth_thread);
3468
3469 /* construct the name for this thread, and then apply it */
3470 bzero(thread_name, sizeof(thread_name));
3471 (void) snprintf(thread_name, sizeof(thread_name),
3472 "dlil_input_poll_%s", ifp->if_xname);
3473 thread_set_thread_name(inp->dlth_thread, thread_name);
3474
3475 lck_mtx_lock(&inp->dlth_lock);
3476 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3477 (void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3478 inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3479 /* wake up once to get out of embryonic state */
3480 dlil_input_wakeup(inp);
3481 lck_mtx_unlock(&inp->dlth_lock);
3482 (void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
3483 /* NOTREACHED */
3484 __builtin_unreachable();
3485 }
3486
3487 __attribute__((noreturn))
3488 static void
dlil_rxpoll_input_thread_cont(void * v,wait_result_t wres)3489 dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
3490 {
3491 struct dlil_threading_info *inp = v;
3492 struct ifnet *ifp = inp->dlth_ifp;
3493 struct timespec ts;
3494
3495 lck_mtx_lock_spin(&inp->dlth_lock);
3496 if (__improbable(wres == THREAD_INTERRUPTED ||
3497 (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
3498 goto terminate;
3499 }
3500
3501 VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
3502 inp->dlth_flags |= DLIL_INPUT_RUNNING;
3503
3504 while (1) {
3505 struct mbuf *m = NULL;
3506 uint32_t m_cnt, poll_req = 0;
3507 uint64_t m_size = 0;
3508 ifnet_model_t mode;
3509 struct timespec now, delta;
3510 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3511 boolean_t notify;
3512 boolean_t embryonic;
3513 uint64_t ival;
3514
3515 inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3516
3517 if (__improbable(embryonic =
3518 (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3519 inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3520 goto skip;
3521 }
3522
3523 if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
3524 ival = IF_RXPOLL_INTERVALTIME_MIN;
3525 }
3526
3527 /* Link parameters changed? */
3528 if (ifp->if_poll_update != 0) {
3529 ifp->if_poll_update = 0;
3530 (void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
3531 }
3532
3533 /* Current operating mode */
3534 mode = ifp->if_poll_mode;
3535
3536 /*
3537 * Protocol registration and injection must always use
3538 * the main input thread; in theory the latter can utilize
3539 * the corresponding input thread where the packet arrived
3540 * on, but that requires our knowing the interface in advance
3541 * (and the benefits might not worth the trouble.)
3542 */
3543 VERIFY(!(inp->dlth_flags &
3544 (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
3545
3546 /* Total count of all packets */
3547 m_cnt = qlen(&inp->dlth_pkts);
3548
3549 /* Total bytes of all packets */
3550 m_size = qsize(&inp->dlth_pkts);
3551
3552 /* Packets for this interface */
3553 _getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3554 m = pkt.cp_mbuf;
3555 VERIFY(m != NULL || m_cnt == 0);
3556
3557 nanouptime(&now);
3558 if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
3559 *(&ifp->if_poll_sample_lasttime) = *(&now);
3560 }
3561
3562 net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
3563 if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
3564 u_int32_t ptot, btot;
3565
3566 /* Accumulate statistics for current sampling */
3567 PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);
3568
3569 if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
3570 goto skip;
3571 }
3572
3573 *(&ifp->if_poll_sample_lasttime) = *(&now);
3574
3575 /* Calculate min/max of inbound bytes */
3576 btot = (u_int32_t)ifp->if_poll_sstats.bytes;
3577 if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
3578 ifp->if_rxpoll_bmin = btot;
3579 }
3580 if (btot > ifp->if_rxpoll_bmax) {
3581 ifp->if_rxpoll_bmax = btot;
3582 }
3583
3584 /* Calculate EWMA of inbound bytes */
3585 DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);
3586
3587 /* Calculate min/max of inbound packets */
3588 ptot = (u_int32_t)ifp->if_poll_sstats.packets;
3589 if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
3590 ifp->if_rxpoll_pmin = ptot;
3591 }
3592 if (ptot > ifp->if_rxpoll_pmax) {
3593 ifp->if_rxpoll_pmax = ptot;
3594 }
3595
3596 /* Calculate EWMA of inbound packets */
3597 DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
3598
3599 /* Reset sampling statistics */
3600 PKTCNTR_CLEAR(&ifp->if_poll_sstats);
3601
3602 /* Calculate EWMA of wakeup requests */
3603 DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
3604 if_rxpoll_decay);
3605 inp->dlth_wtot = 0;
3606
3607 if (dlil_verbose) {
3608 if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
3609 *(&ifp->if_poll_dbg_lasttime) = *(&now);
3610 }
3611 net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
3612 if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
3613 *(&ifp->if_poll_dbg_lasttime) = *(&now);
3614 DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
3615 "limits [%d/%d], wreq avg %d "
3616 "limits [%d/%d], bytes avg %d "
3617 "limits [%d/%d]\n", if_name(ifp),
3618 (ifp->if_poll_mode ==
3619 IFNET_MODEL_INPUT_POLL_ON) ?
3620 "ON" : "OFF", ifp->if_rxpoll_pavg,
3621 ifp->if_rxpoll_pmax,
3622 ifp->if_rxpoll_plowat,
3623 ifp->if_rxpoll_phiwat,
3624 ifp->if_rxpoll_wavg,
3625 ifp->if_rxpoll_wlowat,
3626 ifp->if_rxpoll_whiwat,
3627 ifp->if_rxpoll_bavg,
3628 ifp->if_rxpoll_blowat,
3629 ifp->if_rxpoll_bhiwat);
3630 }
3631 }
3632
3633 /* Perform mode transition, if necessary */
3634 if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
3635 *(&ifp->if_poll_mode_lasttime) = *(&now);
3636 }
3637
3638 net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
3639 if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
3640 goto skip;
3641 }
3642
3643 if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
3644 ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
3645 ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
3646 mode = IFNET_MODEL_INPUT_POLL_OFF;
3647 } else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
3648 (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
3649 ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
3650 ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
3651 mode = IFNET_MODEL_INPUT_POLL_ON;
3652 }
3653
3654 if (mode != ifp->if_poll_mode) {
3655 ifp->if_poll_mode = mode;
3656 *(&ifp->if_poll_mode_lasttime) = *(&now);
3657 poll_req++;
3658 }
3659 }
3660 skip:
3661 notify = dlil_input_stats_sync(ifp, inp);
3662
3663 lck_mtx_unlock(&inp->dlth_lock);
3664
3665 if (__improbable(embryonic)) {
3666 ifnet_decr_pending_thread_count(ifp);
3667 }
3668
3669 if (__improbable(notify)) {
3670 ifnet_notify_data_threshold(ifp);
3671 }
3672
3673 /*
3674 * If there's a mode change and interface is still attached,
3675 * perform a downcall to the driver for the new mode. Also
3676 * hold an IO refcnt on the interface to prevent it from
3677 * being detached (will be release below.)
3678 */
3679 if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
3680 struct ifnet_model_params p = {
3681 .model = mode, .reserved = { 0 }
3682 };
3683 errno_t err;
3684
3685 if (dlil_verbose) {
3686 DLIL_PRINTF("%s: polling is now %s, "
3687 "pkts avg %d max %d limits [%d/%d], "
3688 "wreq avg %d limits [%d/%d], "
3689 "bytes avg %d limits [%d/%d]\n",
3690 if_name(ifp),
3691 (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3692 "ON" : "OFF", ifp->if_rxpoll_pavg,
3693 ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
3694 ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
3695 ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
3696 ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
3697 ifp->if_rxpoll_bhiwat);
3698 }
3699
3700 if ((err = ((*ifp->if_input_ctl)(ifp,
3701 IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
3702 DLIL_PRINTF("%s: error setting polling mode "
3703 "to %s (%d)\n", if_name(ifp),
3704 (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3705 "ON" : "OFF", err);
3706 }
3707
3708 switch (mode) {
3709 case IFNET_MODEL_INPUT_POLL_OFF:
3710 ifnet_set_poll_cycle(ifp, NULL);
3711 ifp->if_rxpoll_offreq++;
3712 if (err != 0) {
3713 ifp->if_rxpoll_offerr++;
3714 }
3715 break;
3716
3717 case IFNET_MODEL_INPUT_POLL_ON:
3718 net_nsectimer(&ival, &ts);
3719 ifnet_set_poll_cycle(ifp, &ts);
3720 ifnet_poll(ifp);
3721 ifp->if_rxpoll_onreq++;
3722 if (err != 0) {
3723 ifp->if_rxpoll_onerr++;
3724 }
3725 break;
3726
3727 default:
3728 VERIFY(0);
3729 /* NOTREACHED */
3730 }
3731
3732 /* Release the IO refcnt */
3733 ifnet_decr_iorefcnt(ifp);
3734 }
3735
3736 /*
3737 * NOTE warning %%% attention !!!!
3738 * We should think about putting some thread starvation
3739 * safeguards if we deal with long chains of packets.
3740 */
3741 if (__probable(m != NULL)) {
3742 dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
3743 }
3744
3745 lck_mtx_lock_spin(&inp->dlth_lock);
3746 VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3747 if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
3748 DLIL_INPUT_TERMINATE))) {
3749 break;
3750 }
3751 }
3752
3753 inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3754
3755 if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
3756 terminate:
3757 lck_mtx_unlock(&inp->dlth_lock);
3758 dlil_terminate_input_thread(inp);
3759 /* NOTREACHED */
3760 } else {
3761 (void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3762 lck_mtx_unlock(&inp->dlth_lock);
3763 (void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
3764 inp);
3765 /* NOTREACHED */
3766 }
3767
3768 VERIFY(0); /* we should never get here */
3769 /* NOTREACHED */
3770 __builtin_unreachable();
3771 }
3772
3773 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3774 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3775 {
3776 if (p != NULL) {
3777 if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3778 (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3779 return EINVAL;
3780 }
3781 if (p->packets_lowat != 0 && /* hiwat must be non-zero */
3782 p->packets_lowat >= p->packets_hiwat) {
3783 return EINVAL;
3784 }
3785 if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3786 (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3787 return EINVAL;
3788 }
3789 if (p->bytes_lowat != 0 && /* hiwat must be non-zero */
3790 p->bytes_lowat >= p->bytes_hiwat) {
3791 return EINVAL;
3792 }
3793 if (p->interval_time != 0 &&
3794 p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3795 p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3796 }
3797 }
3798 return 0;
3799 }
3800
/*
 * Recompute an interface's RX polling parameters.
 *
 * When the input link rate is unknown (zero) and the caller supplied no
 * explicit parameters, polling is effectively disabled: low watermarks
 * are zeroed, high watermarks are made unreachable, and the polling
 * interval is reset to its minimum.  Otherwise, base values are looked
 * up in rxpoll_tbl by link speed and individually overridden by any
 * non-zero caller-supplied values (subject to the sysctl overrides
 * noted inline below).
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		/* (u_int32_t)-1 makes the high watermarks unreachable */
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* Pick the last rxpoll_tbl entry whose speed is <= inbw. */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/*
		 * NOTE(review): a non-zero if_rxpoll_max sysctl takes
		 * precedence over the caller's packets_limit, and a
		 * non-default if_rxpoll_interval_time overrides the
		 * caller's interval_time — the sysctls win whenever set.
		 */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ? if_rxpoll_max : p->packets_limit);
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* Convert nanosecond holdtimes into the interface's timer form. */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3870
3871 /*
3872 * Must be called on an attached ifnet (caller is expected to check.)
3873 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3874 */
3875 errno_t
dlil_rxpoll_set_params(struct ifnet * ifp,struct ifnet_poll_params * p,boolean_t locked)3876 dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
3877 boolean_t locked)
3878 {
3879 errno_t err;
3880 struct dlil_threading_info *inp;
3881
3882 VERIFY(ifp != NULL);
3883 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3884 return ENXIO;
3885 }
3886 err = dlil_rxpoll_validate_params(p);
3887 if (err != 0) {
3888 return err;
3889 }
3890
3891 if (!locked) {
3892 lck_mtx_lock(&inp->dlth_lock);
3893 }
3894 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3895 /*
3896 * Normally, we'd reset the parameters to the auto-tuned values
3897 * if the the input thread detects a change in link rate. If the
3898 * driver provides its own parameters right after a link rate
3899 * changes, but before the input thread gets to run, we want to
3900 * make sure to keep the driver's values. Clearing if_poll_update
3901 * will achieve that.
3902 */
3903 if (p != NULL && !locked && ifp->if_poll_update != 0) {
3904 ifp->if_poll_update = 0;
3905 }
3906 dlil_rxpoll_update_params(ifp, p);
3907 if (!locked) {
3908 lck_mtx_unlock(&inp->dlth_lock);
3909 }
3910 return 0;
3911 }
3912
3913 /*
3914 * Must be called on an attached ifnet (caller is expected to check.)
3915 */
errno_t
dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL && p != NULL);
	/* RX polling must be supported and the input thread present. */
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}

	bzero(p, sizeof(*p));

	/* Snapshot the current values under the input thread's lock. */
	lck_mtx_lock(&inp->dlth_lock);
	p->packets_limit = ifp->if_rxpoll_plim;
	p->packets_lowat = ifp->if_rxpoll_plowat;
	p->packets_hiwat = ifp->if_rxpoll_phiwat;
	p->bytes_lowat = ifp->if_rxpoll_blowat;
	p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
	p->interval_time = ifp->if_rxpoll_ival;
	lck_mtx_unlock(&inp->dlth_lock);

	return 0;
}
3939
/*
 * Inject an inbound mbuf chain into the stack (simple variant): the
 * chain tail is discovered by walking the list, and the optional stat
 * increments `s` may be NULL.
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3946
/*
 * Extended variant: the caller supplies the chain tail and mandatory
 * stat increments `s`, whose packet count is validated against the
 * chain in ifnet_input_common().
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3953
/*
 * Polling variant: an empty chain (m_head == NULL) is permitted, and
 * the extended ("ext") treatment is implied by a non-empty chain.
 */
errno_t
ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s,
	    (m_head != NULL), TRUE);
}
3961
/*
 * Common implementation behind ifnet_input{,_extended,_poll}().
 * Validates the chain and optional stat increments, counts packets and
 * bytes when the caller did not, and hands the chain to the interface's
 * DLIL input function while holding a datamov reference so the
 * interface cannot detach underneath us.  On any validation failure the
 * chain is freed and EINVAL returned.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* An empty chain is only legal when polling; ext requires stats. */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* No tail supplied: walk the chain to find it, counting as we go. */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* Re-derive the counts to cross-check the caller's below. */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;	/* s now aliases _s; the stores below flow through */
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	/*
	 * NOTE(review): the computed counts stored into _s reach the input
	 * function only in the s == NULL case above (where s aliases _s);
	 * with a caller-supplied s, `s` — not `&_s` — is passed down, so
	 * those two stores appear to have no effect.  Confirm whether that
	 * is intentional before changing.
	 */
	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
4076
4077 #if SKYWALK
/*
 * Atomically install `fn` as the interface's DLIL input handler, but
 * only if the current handler is still the default dlil_input_handler;
 * returns EBUSY if another handler had already been installed.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, &dlil_input_handler),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4085
/*
 * Restore the default DLIL input handler, retrying the compare-and-swap
 * (against whatever handler is currently installed) until it succeeds.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
		;
	}
}
4095
/*
 * Atomically install `fn` as the interface's DLIL output handler, but
 * only if the current handler is still the default dlil_output_handler;
 * returns EBUSY if another handler had already been installed.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, &dlil_output_handler),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4103
/*
 * Restore the default DLIL output handler, retrying the
 * compare-and-swap until it succeeds.
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
		;
	}
}
4113 #endif /* SKYWALK */
4114
/*
 * Default DLIL output handler: hand the packet straight to the
 * interface's if_output routine.
 */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
4120
/*
 * Default DLIL input handler: dispatch an inbound chain via the input
 * thread's strategy function, falling back to the main input thread
 * when the interface has no dedicated one.  On DEVELOPMENT/DEBUG
 * kernels, a thread marked NET_THREAD_SYNC_RX forces synchronous
 * processing in the caller's context instead.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
4141
4142 /*
4143 * Detect whether a queue contains a burst that needs to be trimmed.
4144 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q) \
	__improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) && \
	qtype(q) == QP_MBUF)

/*
 * NOTE(review): not referenced within this section of the file —
 * presumably an upper bound on mbuf classes used by drop/trim
 * accounting elsewhere; confirm at the use sites.
 */
#define MAX_KNOWN_MBUF_CLASS 8
4150
/*
 * Trim an overcommitted input queue down to if_rcvq_trim_pct percent of
 * its limit.  The oldest packets are unlinked from the head of the
 * queue and moved onto `freeq`; the caller frees them after dropping
 * the input-thread lock.  `stat_delta` is adjusted so the dropped
 * packets/bytes are recorded as drops rather than received traffic.
 * Returns the number of packets dropped (0 if already within limits).
 *
 * Called with the input thread's dlth_lock held (hence "_locked").
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;  /* Length in packets. */
	uint64_t overcommitted_qsize; /* Size in bytes. */
	uint32_t target_qlen;         /* The desired queue length after trimming. */
	uint32_t pkts_to_drop;        /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;    /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;   /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
4247
/*
 * Asynchronous DLIL input strategy: enqueue the inbound chain on the
 * input thread's receive queue (loopback packets handled by the main
 * thread use a dedicated lo0 queue), trim the queue if the burst limit
 * is exceeded, account the (possibly adjusted) stats, and wake the
 * input thread to perform the actual protocol processing.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;	/* trimmed packets; freed after the lock is dropped */
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set. We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack. Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			/* Oldest packets move to freeq; s_adj is reduced to match. */
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			/*
			 * NOTE(review): this format string lacks a trailing
			 * '\n', unlike the identical message in
			 * dlil_input_sync().
			 */
			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context. All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4391
/*
 * Synchronous DLIL input strategy: enqueue the inbound chain on the
 * input thread's queue, then immediately drain the queue and process
 * all packets in the calling thread's context instead of waking the
 * input thread.  Never used with the main input thread.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;	/* trimmed packets; freed after the lock is dropped */
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		/* Oldest packets move to freeq; s_adj is reduced to match. */
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/* Cross-check the caller-supplied counts against the actual chain. */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* Drain everything queued (possibly more than we just added). */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4501
4502 #if SKYWALK
/*
 * Atomically install `fn` as if_output, but only while the current
 * handler still equals the saved original (if_save_output); returns
 * EBUSY if another handler had already been swapped in.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_save_output),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4510
/*
 * Restore if_output to the saved original handler (if_save_output),
 * retrying the compare-and-swap until it takes effect.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
		;
	}
}
4520
/*
 * Atomically install `fn` as if_start, but only while the current
 * handler still equals the saved original (if_save_start); returns
 * EBUSY if another handler had already been swapped in.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_save_start),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4528
/*
 * Restore if_start to the saved original handler (if_save_start),
 * retrying the compare-and-swap until it takes effect.
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
		;
	}
}
4538 #endif /* SKYWALK */
4539
/*
 * Common kick for the interface's starter thread.  `resetfc` clears
 * the flow-controlled state before deciding whether to wake the
 * thread; `ignore_delay` sets IFSF_NO_DELAY so delayed-start batching
 * is bypassed.  No-op for interfaces without a starter (IFEF_TXSTART).
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (ignore_delay) {
		ifp->if_start_flags |= IFSF_NO_DELAY;
	}
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/* Wake only an idle starter, and honor delayed-start batching. */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4572
/*
 * Record the pacemaker transmit time for the interface.
 * NOTE(review): the store is done without taking if_start_lock —
 * presumably an advisory hint read by the starter thread; confirm
 * readers tolerate concurrent updates.
 */
void
ifnet_start_set_pacemaker_time(struct ifnet *ifp, uint64_t tx_time)
{
	ifp->if_start_pacemaker_time = tx_time;
}
4578
/*
 * Kick the starter thread without touching flow-control state and
 * honoring any delayed-start batching.
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, FALSE);
}
4584
/*
 * Kick the starter thread, bypassing delayed-start batching
 * (sets IFSF_NO_DELAY).
 */
void
ifnet_start_ignore_delay(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, TRUE);
}
4590
/*
 * Entry point for an interface's dedicated starter thread.  Names the
 * thread, optionally binds the lo0 starter to the main input thread's
 * affinity set, then parks in the embryonic state until first woken;
 * all subsequent work happens in ifnet_start_thread_cont().  Never
 * returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag. This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4656
/*
 * Continuation body of the transmit starter thread.  Services start
 * requests by repeatedly invoking the driver's if_start routine until
 * no new request arrives, then blocks again -- either indefinitely or
 * with a deadline derived from the pacemaker timer, TBR pacing cycle,
 * or the enqueue-coalescing delay.  All state lives in the ifnet (this
 * runs as a Mach continuation with no stack history).  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	if (__improbable(ifp->if_start_embryonic)) {
		/* first wakeup: leave embryonic state, ack thread creation */
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		/* snapshot request count to detect new requests after if_start */
		u_int32_t req = ifp->if_start_req;
		/* start-delay heuristic: hold off to coalesce small bursts */
		if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
		    !IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_flags &= ~IFSF_NO_DELAY;
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec pacemaker_ts;
		struct timespec *ts = NULL;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		if (ifp->if_start_pacemaker_time != 0) {
			struct timespec now_ts;
			uint64_t now;

			nanouptime(&now_ts);
			now = ((uint64_t)now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;

			if (ifp->if_start_pacemaker_time != 0 &&
			    ifp->if_start_pacemaker_time > now) {
				/* pacemaker deadline still ahead: arm a timed wakeup */
				pacemaker_ts.tv_sec = 0;
				pacemaker_ts.tv_nsec = ifp->if_start_pacemaker_time - now;

				ts = &pacemaker_ts;
				ifp->if_start_flags |= IFSF_NO_DELAY;
				DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet*, ifp,
				    uint64_t, pacemaker_ts.tv_nsec);
			} else {
				/* deadline already passed: record the miss and disarm */
				DTRACE_SKYWALK2(pacemaker__timer__miss, struct ifnet*, ifp,
				    uint64_t, now - ifp->if_start_pacemaker_time);
				ifp->if_start_pacemaker_time = 0;
				ifp->if_start_flags &= ~IFSF_NO_DELAY;
			}
		}

		if (ts == NULL) {
			ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
			    &ifp->if_start_cycle : NULL);
		}

		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		/* a zero timespec means "no timed wakeup" */
		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		ifp->if_start_pacemaker_time = 0;
		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4834
4835 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4836 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4837 {
4838 if (ts == NULL) {
4839 bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4840 } else {
4841 *(&ifp->if_start_cycle) = *ts;
4842 }
4843
4844 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4845 DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4846 if_name(ifp), ts->tv_nsec);
4847 }
4848 }
4849
4850 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4851 ifnet_poll_wakeup(struct ifnet *ifp)
4852 {
4853 LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4854
4855 ifp->if_poll_req++;
4856 if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4857 ifp->if_poll_thread != THREAD_NULL) {
4858 wakeup_one((caddr_t)&ifp->if_poll_thread);
4859 }
4860 }
4861
4862 void
ifnet_poll(struct ifnet * ifp)4863 ifnet_poll(struct ifnet *ifp)
4864 {
4865 /*
4866 * If the poller thread is inactive, signal it to do work.
4867 */
4868 lck_mtx_lock_spin(&ifp->if_poll_lock);
4869 ifnet_poll_wakeup(ifp);
4870 lck_mtx_unlock(&ifp->if_poll_lock);
4871 }
4872
/*
 * Entry point of the per-interface input poller thread.  Names the
 * thread, then parks in the "embryonic" state; all servicing happens
 * in the continuation, ifnet_poll_thread_cont().  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	/* assert-wait before posting the wakeup so it cannot be lost */
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4901
/*
 * Continuation body of the input poller thread.  Repeatedly invokes
 * the driver's if_input_poll routine and feeds harvested packets into
 * the input path until no new poll request arrives, then blocks again
 * (with a deadline when a poll cycle interval is configured).  Runs as
 * a Mach continuation with no stack history.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		/* first wakeup: leave embryonic state, ack thread creation */
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		/* snapshot request count to detect new requests after polling */
		u_int16_t req = ifp->if_poll_req;

		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		    &m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll: still notify the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request (or we're terminating), we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
5068
5069 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)5070 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
5071 {
5072 if (ts == NULL) {
5073 bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
5074 } else {
5075 *(&ifp->if_poll_cycle) = *ts;
5076 }
5077
5078 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
5079 DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
5080 if_name(ifp), ts->tv_nsec);
5081 }
5082 }
5083
5084 void
ifnet_purge(struct ifnet * ifp)5085 ifnet_purge(struct ifnet *ifp)
5086 {
5087 if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
5088 if_qflush_snd(ifp, false);
5089 }
5090 }
5091
5092 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)5093 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
5094 {
5095 IFCQ_LOCK_ASSERT_HELD(ifq);
5096
5097 if (!(IFCQ_IS_READY(ifq))) {
5098 return;
5099 }
5100
5101 if (IFCQ_TBR_IS_ENABLED(ifq)) {
5102 struct tb_profile tb = {
5103 .rate = ifq->ifcq_tbr.tbr_rate_raw,
5104 .percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
5105 };
5106 (void) ifclassq_tbr_set(ifq, &tb, FALSE);
5107 }
5108
5109 ifclassq_update(ifq, ev);
5110 }
5111
5112 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)5113 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
5114 {
5115 switch (ev) {
5116 case CLASSQ_EV_LINK_BANDWIDTH:
5117 if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
5118 ifp->if_poll_update++;
5119 }
5120 break;
5121
5122 default:
5123 break;
5124 }
5125 }
5126
5127 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)5128 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
5129 {
5130 struct ifclassq *ifq;
5131 u_int32_t omodel;
5132 errno_t err;
5133
5134 if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
5135 return EINVAL;
5136 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5137 return ENXIO;
5138 }
5139
5140 ifq = ifp->if_snd;
5141 IFCQ_LOCK(ifq);
5142 omodel = ifp->if_output_sched_model;
5143 ifp->if_output_sched_model = model;
5144 if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
5145 ifp->if_output_sched_model = omodel;
5146 }
5147 IFCQ_UNLOCK(ifq);
5148
5149 return err;
5150 }
5151
5152 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5153 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5154 {
5155 if (ifp == NULL) {
5156 return EINVAL;
5157 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5158 return ENXIO;
5159 }
5160
5161 ifclassq_set_maxlen(ifp->if_snd, maxqlen);
5162
5163 return 0;
5164 }
5165
5166 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5167 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5168 {
5169 if (ifp == NULL || maxqlen == NULL) {
5170 return EINVAL;
5171 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5172 return ENXIO;
5173 }
5174
5175 *maxqlen = ifclassq_get_maxlen(ifp->if_snd);
5176
5177 return 0;
5178 }
5179
5180 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)5181 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
5182 {
5183 errno_t err;
5184
5185 if (ifp == NULL || pkts == NULL) {
5186 err = EINVAL;
5187 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5188 err = ENXIO;
5189 } else {
5190 err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
5191 IF_CLASSQ_ALL_GRPS, pkts, NULL);
5192 }
5193
5194 return err;
5195 }
5196
5197 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)5198 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
5199 u_int32_t *pkts, u_int32_t *bytes)
5200 {
5201 errno_t err;
5202
5203 if (ifp == NULL || !MBUF_VALID_SC(sc) ||
5204 (pkts == NULL && bytes == NULL)) {
5205 err = EINVAL;
5206 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5207 err = ENXIO;
5208 } else {
5209 err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
5210 pkts, bytes);
5211 }
5212
5213 return err;
5214 }
5215
5216 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)5217 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5218 {
5219 struct dlil_threading_info *inp;
5220
5221 if (ifp == NULL) {
5222 return EINVAL;
5223 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5224 return ENXIO;
5225 }
5226
5227 if (maxqlen == 0) {
5228 maxqlen = if_rcvq_maxlen;
5229 } else if (maxqlen < IF_RCVQ_MINLEN) {
5230 maxqlen = IF_RCVQ_MINLEN;
5231 }
5232
5233 inp = ifp->if_inp;
5234 lck_mtx_lock(&inp->dlth_lock);
5235 qlimit(&inp->dlth_pkts) = maxqlen;
5236 lck_mtx_unlock(&inp->dlth_lock);
5237
5238 return 0;
5239 }
5240
5241 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)5242 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5243 {
5244 struct dlil_threading_info *inp;
5245
5246 if (ifp == NULL || maxqlen == NULL) {
5247 return EINVAL;
5248 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5249 return ENXIO;
5250 }
5251
5252 inp = ifp->if_inp;
5253 lck_mtx_lock(&inp->dlth_lock);
5254 *maxqlen = qlimit(&inp->dlth_pkts);
5255 lck_mtx_unlock(&inp->dlth_lock);
5256 return 0;
5257 }
5258
5259 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)5260 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
5261 uint16_t delay_timeout)
5262 {
5263 if (delay_qlen > 0 && delay_timeout > 0) {
5264 if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
5265 ifp->if_start_delay_qlen = MIN(100, delay_qlen);
5266 ifp->if_start_delay_timeout = min(20000, delay_timeout);
5267 /* convert timeout to nanoseconds */
5268 ifp->if_start_delay_timeout *= 1000;
5269 kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
5270 ifp->if_xname, (uint32_t)delay_qlen,
5271 (uint32_t)delay_timeout);
5272 } else {
5273 if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
5274 }
5275 }
5276
/*
 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
 * buf holds the full header.
 *
 * For IPv4 the header checksum is updated incrementally (with a single
 * carry fold); IPv6 has no header checksum to fix up.  Misaligned
 * headers are bounce-copied through an aligned local buffer to avoid
 * unaligned word accesses.
 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned bounce buffer, large enough for either header */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* common case: no DSCP bits set, nothing to do */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		/* keep the ECN bits, drop DSCP */
		ip->ip_tos &= IPTOS_ECN_MASK;
		/* incremental checksum adjustment; fold the carry once */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		if (__improbable(p == lbuf)) {
			/* write the modified header back to the caller's buffer */
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* common case: no DSCP bits set, nothing to do */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		if (__improbable(p == lbuf)) {
			/* write the modified header back to the caller's buffer */
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
5337
/*
 * Enqueue a single classq packet (mbuf or native Skywalk packet) on the
 * caller-supplied ifcq (or, if NULL, the interface's send queue) and,
 * subject to the enqueue-coalescing heuristics, kick the driver's
 * starter thread.  Along the way: timestamp the packet if it does not
 * already carry one, update foreground/realtime activity timestamps,
 * and apply the Wi-Fi multicast DSCP-clearing workaround.  The caller
 * relinquishes ownership of the packet regardless of the return value;
 * *pdrop reports whether it was dropped.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here. Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					/* m_pullup freed the mbuf on failure */
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP multicast: leave it untouched */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* mbuf data may have moved; re-derive eh */
				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface. If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* too short for an Ethernet header: skip workaround */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP multicast: leave it untouched */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			/* not sampled above (timestamp was already valid) */
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the sampling window: count it */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and disable delaying */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: decide based on observed rate */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				/* start a fresh sampling window */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first enqueue: open the initial sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5648
5649 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5650 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5651 classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5652 boolean_t flush, boolean_t *pdrop)
5653 {
5654 int error;
5655
5656 /* enqueue the packet (caller consumes object) */
5657 error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5658 cnt, bytes, pdrop);
5659
5660 /*
5661 * Tell the driver to start dequeueing; do this even when the queue
5662 * for the packet is suspended (EQSUSPENDED), as the driver could still
5663 * be dequeueing from other unsuspended queues.
5664 */
5665 if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5666 ifnet_start(ifp);
5667 }
5668 return error;
5669 }
5670
5671 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5672 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5673 {
5674 struct ifnet *ifp = handle;
5675 boolean_t pdrop; /* dummy */
5676 uint32_t i;
5677
5678 ASSERT(n_pkts >= 1);
5679 for (i = 0; i < n_pkts - 1; i++) {
5680 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5681 FALSE, &pdrop);
5682 }
5683 /* flush with the last packet */
5684 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5685 TRUE, &pdrop);
5686
5687 return 0;
5688 }
5689
5690 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5691 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5692 classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5693 {
5694 if (ifp->if_output_netem != NULL) {
5695 bool drop;
5696 errno_t error;
5697 error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5698 *pdrop = drop ? TRUE : FALSE;
5699 return error;
5700 } else {
5701 return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5702 }
5703 }
5704
5705 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5706 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5707 {
5708 boolean_t pdrop;
5709 return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5710 }
5711
5712 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5713 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5714 boolean_t *pdrop)
5715 {
5716 classq_pkt_t pkt;
5717
5718 if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5719 m->m_nextpkt != NULL) {
5720 if (m != NULL) {
5721 m_freem_list(m);
5722 *pdrop = TRUE;
5723 }
5724 return EINVAL;
5725 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5726 !IF_FULLY_ATTACHED(ifp)) {
5727 /* flag tested without lock for performance */
5728 m_freem(m);
5729 *pdrop = TRUE;
5730 return ENXIO;
5731 } else if (!(ifp->if_flags & IFF_UP)) {
5732 m_freem(m);
5733 *pdrop = TRUE;
5734 return ENETDOWN;
5735 }
5736
5737 CLASSQ_PKT_INIT_MBUF(&pkt, m);
5738 return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5739 }
5740
5741 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5742 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5743 struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5744 boolean_t *pdrop)
5745 {
5746 classq_pkt_t head, tail;
5747
5748 ASSERT(m_head != NULL);
5749 ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5750 ASSERT(m_tail != NULL);
5751 ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5752 ASSERT(ifp != NULL);
5753 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5754
5755 if (!IF_FULLY_ATTACHED(ifp)) {
5756 /* flag tested without lock for performance */
5757 m_freem_list(m_head);
5758 *pdrop = TRUE;
5759 return ENXIO;
5760 } else if (!(ifp->if_flags & IFF_UP)) {
5761 m_freem_list(m_head);
5762 *pdrop = TRUE;
5763 return ENETDOWN;
5764 }
5765
5766 CLASSQ_PKT_INIT_MBUF(&head, m_head);
5767 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5768 return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5769 flush, pdrop);
5770 }
5771
5772 #if SKYWALK
5773 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5774 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5775 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5776 {
5777 classq_pkt_t pkt;
5778
5779 ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5780
5781 if (__improbable(ifp == NULL || kpkt == NULL)) {
5782 if (kpkt != NULL) {
5783 pp_free_packet(__DECONST(struct kern_pbufpool *,
5784 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5785 *pdrop = TRUE;
5786 }
5787 return EINVAL;
5788 } else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5789 !IF_FULLY_ATTACHED(ifp))) {
5790 /* flag tested without lock for performance */
5791 pp_free_packet(__DECONST(struct kern_pbufpool *,
5792 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5793 *pdrop = TRUE;
5794 return ENXIO;
5795 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5796 pp_free_packet(__DECONST(struct kern_pbufpool *,
5797 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5798 *pdrop = TRUE;
5799 return ENETDOWN;
5800 }
5801
5802 CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5803 return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5804 }
5805
5806 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5807 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5808 boolean_t flush, boolean_t *pdrop)
5809 {
5810 return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5811 }
5812
5813 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5814 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5815 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5816 {
5817 return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5818 }
5819
5820 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5821 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5822 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5823 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5824 {
5825 classq_pkt_t head, tail;
5826
5827 ASSERT(k_head != NULL);
5828 ASSERT(k_tail != NULL);
5829 ASSERT(ifp != NULL);
5830 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5831
5832 if (!IF_FULLY_ATTACHED(ifp)) {
5833 /* flag tested without lock for performance */
5834 pp_free_packet_chain(k_head, NULL);
5835 *pdrop = TRUE;
5836 return ENXIO;
5837 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5838 pp_free_packet_chain(k_head, NULL);
5839 *pdrop = TRUE;
5840 return ENETDOWN;
5841 }
5842
5843 CLASSQ_PKT_INIT_PACKET(&head, k_head);
5844 CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5845 return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5846 flush, pdrop);
5847 }
5848
5849 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5850 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5851 struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5852 boolean_t *pdrop)
5853 {
5854 return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5855 cnt, bytes, flush, pdrop);
5856 }
5857
5858 errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5859 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5860 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5861 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5862 {
5863 return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5864 cnt, bytes, flush, pdrop);
5865 }
5866 #endif /* SKYWALK */
5867
5868 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5869 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5870 {
5871 errno_t rc;
5872 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5873
5874 if (ifp == NULL || mp == NULL) {
5875 return EINVAL;
5876 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5877 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5878 return ENXIO;
5879 }
5880 if (!ifnet_is_attached(ifp, 1)) {
5881 return ENXIO;
5882 }
5883
5884 #if SKYWALK
5885 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5886 #endif /* SKYWALK */
5887 rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5888 &pkt, NULL, NULL, NULL, 0);
5889 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5890 ifnet_decr_iorefcnt(ifp);
5891 *mp = pkt.cp_mbuf;
5892 return rc;
5893 }
5894
5895 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5896 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5897 struct mbuf **mp)
5898 {
5899 errno_t rc;
5900 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5901
5902 if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5903 return EINVAL;
5904 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5905 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5906 return ENXIO;
5907 }
5908 if (!ifnet_is_attached(ifp, 1)) {
5909 return ENXIO;
5910 }
5911
5912 #if SKYWALK
5913 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5914 #endif /* SKYWALK */
5915 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5916 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5917 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5918 ifnet_decr_iorefcnt(ifp);
5919 *mp = pkt.cp_mbuf;
5920 return rc;
5921 }
5922
5923 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5924 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5925 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5926 {
5927 errno_t rc;
5928 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5929 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5930
5931 if (ifp == NULL || head == NULL || pkt_limit < 1) {
5932 return EINVAL;
5933 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5934 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5935 return ENXIO;
5936 }
5937 if (!ifnet_is_attached(ifp, 1)) {
5938 return ENXIO;
5939 }
5940
5941 #if SKYWALK
5942 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5943 #endif /* SKYWALK */
5944 rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5945 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5946 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5947 ifnet_decr_iorefcnt(ifp);
5948 *head = pkt_head.cp_mbuf;
5949 if (tail != NULL) {
5950 *tail = pkt_tail.cp_mbuf;
5951 }
5952 return rc;
5953 }
5954
5955 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5956 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5957 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5958 {
5959 errno_t rc;
5960 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5961 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5962
5963 if (ifp == NULL || head == NULL || byte_limit < 1) {
5964 return EINVAL;
5965 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5966 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5967 return ENXIO;
5968 }
5969 if (!ifnet_is_attached(ifp, 1)) {
5970 return ENXIO;
5971 }
5972
5973 #if SKYWALK
5974 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5975 #endif /* SKYWALK */
5976 rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5977 byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5978 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5979 ifnet_decr_iorefcnt(ifp);
5980 *head = pkt_head.cp_mbuf;
5981 if (tail != NULL) {
5982 *tail = pkt_tail.cp_mbuf;
5983 }
5984 return rc;
5985 }
5986
5987 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5988 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5989 u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5990 u_int32_t *len)
5991 {
5992 errno_t rc;
5993 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5994 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5995
5996 if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5997 !MBUF_VALID_SC(sc)) {
5998 return EINVAL;
5999 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
6000 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
6001 return ENXIO;
6002 }
6003 if (!ifnet_is_attached(ifp, 1)) {
6004 return ENXIO;
6005 }
6006
6007 #if SKYWALK
6008 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
6009 #endif /* SKYWALK */
6010 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
6011 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
6012 cnt, len, 0);
6013 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
6014 ifnet_decr_iorefcnt(ifp);
6015 *head = pkt_head.cp_mbuf;
6016 if (tail != NULL) {
6017 *tail = pkt_tail.cp_mbuf;
6018 }
6019 return rc;
6020 }
6021
6022 #if XNU_TARGET_OS_OSX
6023 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)6024 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
6025 const struct sockaddr *dest, const char *dest_linkaddr,
6026 const char *frame_type, u_int32_t *pre, u_int32_t *post)
6027 {
6028 if (pre != NULL) {
6029 *pre = 0;
6030 }
6031 if (post != NULL) {
6032 *post = 0;
6033 }
6034
6035 return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
6036 }
6037 #endif /* XNU_TARGET_OS_OSX */
6038
6039 static boolean_t
packet_has_vlan_tag(struct mbuf * m)6040 packet_has_vlan_tag(struct mbuf * m)
6041 {
6042 u_int tag = 0;
6043
6044 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
6045 tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
6046 if (tag == 0) {
6047 /* the packet is just priority-tagged, clear the bit */
6048 m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
6049 }
6050 }
6051 return tag != 0;
6052 }
6053
/*
 * Run the inbound packet through ifp's interface filter chain.
 *
 * m_p and frame_header_p are in/out: a filter may replace the mbuf
 * and/or the frame header pointer.  Returns 0 to continue normal input
 * processing, or the first filter's non-zero result (callers treat
 * EJUSTRETURN as "filter consumed the packet").
 *
 * Locking: if_flt_lock is dropped around each filter callback; the
 * busy/unbusy monitor keeps the filter list stable across that window.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet = FALSE;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	/*
	 * NOTE(review): packet_has_vlan_tag() also clears
	 * CSUM_VLAN_TAG_VALID on priority-tagged packets; here that side
	 * effect happens even when the filter list is empty, whereas the
	 * output variant checks for an empty list first — confirm whether
	 * the asymmetry is intentional.
	 */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback (may block) */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
6114
/*
 * Run the outbound packet through ifp's interface filter chain.
 *
 * m_p is in/out: a filter may replace the mbuf.  Returns 0 to continue
 * normal output processing, or the first filter's non-zero result
 * (callers treat EJUSTRETURN as "filter consumed the packet").
 *
 * Locking mirrors the input variant: if_flt_lock is dropped around each
 * callback while the busy monitor pins the filter list.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}
	/* may clear CSUM_VLAN_TAG_VALID for priority-tagged packets */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback (may block) */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
6167
6168 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)6169 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
6170 {
6171 int error;
6172
6173 if (ifproto->proto_kpi == kProtoKPI_v1) {
6174 /* Version 1 protocols get one packet at a time */
6175 while (m != NULL) {
6176 char * frame_header;
6177 mbuf_t next_packet;
6178
6179 next_packet = m->m_nextpkt;
6180 m->m_nextpkt = NULL;
6181 frame_header = m->m_pkthdr.pkt_hdr;
6182 m->m_pkthdr.pkt_hdr = NULL;
6183 error = (*ifproto->kpi.v1.input)(ifproto->ifp,
6184 ifproto->protocol_family, m, frame_header);
6185 if (error != 0 && error != EJUSTRETURN) {
6186 m_freem(m);
6187 }
6188 m = next_packet;
6189 }
6190 } else if (ifproto->proto_kpi == kProtoKPI_v2) {
6191 /* Version 2 protocols support packet lists */
6192 error = (*ifproto->kpi.v2.input)(ifproto->ifp,
6193 ifproto->protocol_family, m);
6194 if (error != 0 && error != EJUSTRETURN) {
6195 m_freem_list(m);
6196 }
6197 }
6198 }
6199
6200 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)6201 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
6202 struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
6203 {
6204 struct ifnet_stat_increment_param *d = &inp->dlth_stats;
6205
6206 if (s->packets_in != 0) {
6207 d->packets_in += s->packets_in;
6208 }
6209 if (s->bytes_in != 0) {
6210 d->bytes_in += s->bytes_in;
6211 }
6212 if (s->errors_in != 0) {
6213 d->errors_in += s->errors_in;
6214 }
6215
6216 if (s->packets_out != 0) {
6217 d->packets_out += s->packets_out;
6218 }
6219 if (s->bytes_out != 0) {
6220 d->bytes_out += s->bytes_out;
6221 }
6222 if (s->errors_out != 0) {
6223 d->errors_out += s->errors_out;
6224 }
6225
6226 if (s->collisions != 0) {
6227 d->collisions += s->collisions;
6228 }
6229 if (s->dropped != 0) {
6230 d->dropped += s->dropped;
6231 }
6232
6233 if (poll) {
6234 PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
6235 }
6236 }
6237
/*
 * Drain the input thread's stat accumulator (inp->dlth_stats) into the
 * interface's global counters, zeroing each accumulator field as it is
 * flushed.  Returns TRUE when if_data_threshold is set, i.e. the caller
 * should also evaluate the data threshold notification.
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		/* note: input-thread drops are accounted as input-queue drops */
		os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
6297
6298 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6299 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6300 {
6301 return dlil_input_packet_list_common(ifp, m, 0,
6302 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6303 }
6304
6305 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6306 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6307 u_int32_t cnt, ifnet_model_t mode)
6308 {
6309 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6310 }
6311
/*
 * Core DLIL input loop: walk the packet chain m, demux each packet to a
 * protocol family, run CLAT46 translation and interface filters, then
 * batch consecutive packets destined for the same protocol and hand the
 * batch to dlil_ifproto_input().
 *
 * ifp_param may be NULL, in which case each packet's rcvif is used (the
 * chain may then span multiple interfaces).  cnt/mode/ext come from the
 * extended entry point and drive opportunistic RX polling.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;            /* batch for last_ifproto */
	mbuf_t *pkt_next = NULL;            /* tail pointer of that batch */
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;                   /* 1 while holding a datamov ref */

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* arm opportunistic polling only for multi-packet extended input */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		/* poke the legacy poller every poll_ival packets */
		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach this packet; save and clear its frame header */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		/* keep the wake-packet marker across classifier reset */
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				/* demux consumed the packet */
				goto next;
			}
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			/* 464 translation may replace m and protocol_family */
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceeding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has been
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			/*
			 * NOTE(review): only non-IP families are matched
			 * here; IP/IPv6 wake packets are presumably matched
			 * further up in their input paths — confirm.
			 */
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				/* inconsistent offsets: invalidate the cksum */
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			os_atomic_inc(&ifp->if_imcasts, relaxed);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same protocol as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6644
6645 static errno_t
if_mcasts_update_common(struct ifnet * ifp,bool sync)6646 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6647 {
6648 errno_t err;
6649
6650 if (sync) {
6651 err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6652 if (err == EAFNOSUPPORT) {
6653 err = 0;
6654 }
6655 } else {
6656 ifnet_ioctl_async(ifp, SIOCADDMULTI);
6657 err = 0;
6658 }
6659 DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6660 "(err=%d)\n", if_name(ifp),
6661 (err == 0 ? "successfully restored" : "failed to restore"),
6662 ifp->if_updatemcasts, err);
6663
6664 /* just return success */
6665 return 0;
6666 }
6667
6668 static errno_t
if_mcasts_update_async(struct ifnet * ifp)6669 if_mcasts_update_async(struct ifnet *ifp)
6670 {
6671 return if_mcasts_update_common(ifp, false);
6672 }
6673
6674 errno_t
if_mcasts_update(struct ifnet * ifp)6675 if_mcasts_update(struct ifnet *ifp)
6676 {
6677 return if_mcasts_update_common(ifp, true);
6678 }
6679
/*
 * Post a kernel event message, optionally tied to an interface.
 *
 * If ifp is set, we will increment the generation for the interface
 * before posting, and (when NECP is built in) notify all NECP clients
 * so they can re-evaluate interface state.
 *
 * Returns the result of kev_post_msg().
 */
int
dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
{
	if (ifp != NULL) {
		ifnet_increment_generation(ifp);
	}

#if NECP
	necp_update_all_clients();
#endif /* NECP */

	return kev_post_msg(event);
}
6694
6695 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6696 dlil_post_sifflags_msg(struct ifnet * ifp)
6697 {
6698 struct kev_msg ev_msg;
6699 struct net_event_data ev_data;
6700
6701 bzero(&ev_data, sizeof(ev_data));
6702 bzero(&ev_msg, sizeof(ev_msg));
6703 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6704 ev_msg.kev_class = KEV_NETWORK_CLASS;
6705 ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6706 ev_msg.event_code = KEV_DL_SIFFLAGS;
6707 strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6708 ev_data.if_family = ifp->if_family;
6709 ev_data.if_unit = (u_int32_t) ifp->if_unit;
6710 ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6711 ev_msg.dv[0].data_ptr = &ev_data;
6712 ev_msg.dv[1].data_length = 0;
6713 dlil_post_complete_msg(ifp, &ev_msg);
6714 }
6715
/* protocol refs staged on the stack before spilling to a heap array */
#define TMP_IF_PROTO_ARR_SIZE 10
/*
 * Deliver a kernel event to all parties attached to the interface:
 * first the interface filters, then each attached protocol's event
 * handler, then the interface's own if_event callback.  Finally the
 * event is posted system-wide via dlil_post_complete_msg(); when
 * update_generation is set that also bumps the interface generation.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock around the callback; the busy
			 * monitor above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			/* too many protocols for the stack array; spill */
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* snapshot every attached protocol, taking a ref on each
		 * so they survive after the ifnet lock is dropped */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* invoke each protocol's event handler (v1 or v2 KPI), then
	 * release the ref taken during the snapshot above */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6816
6817 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6818 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6819 {
6820 struct kev_msg kev_msg;
6821 int result = 0;
6822
6823 if (ifp == NULL || event == NULL) {
6824 return EINVAL;
6825 }
6826
6827 bzero(&kev_msg, sizeof(kev_msg));
6828 kev_msg.vendor_code = event->vendor_code;
6829 kev_msg.kev_class = event->kev_class;
6830 kev_msg.kev_subclass = event->kev_subclass;
6831 kev_msg.event_code = event->event_code;
6832 kev_msg.dv[0].data_ptr = &event->event_data[0];
6833 kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6834 kev_msg.dv[1].data_length = 0;
6835
6836 result = dlil_event_internal(ifp, &kev_msg, TRUE);
6837
6838 return result;
6839 }
6840
6841 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6842 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6843 {
6844 mbuf_t n = m;
6845 int chainlen = 0;
6846
6847 while (n != NULL) {
6848 chainlen++;
6849 n = n->m_next;
6850 }
6851 switch (chainlen) {
6852 case 0:
6853 break;
6854 case 1:
6855 os_atomic_inc(&cls->cls_one, relaxed);
6856 break;
6857 case 2:
6858 os_atomic_inc(&cls->cls_two, relaxed);
6859 break;
6860 case 3:
6861 os_atomic_inc(&cls->cls_three, relaxed);
6862 break;
6863 case 4:
6864 os_atomic_inc(&cls->cls_four, relaxed);
6865 break;
6866 case 5:
6867 default:
6868 os_atomic_inc(&cls->cls_five_or_more, relaxed);
6869 break;
6870 }
6871 }
6872
#if CONFIG_DTRACE
/*
 * Fire the DTrace "send" IP probe for an outbound IPv4 or IPv6 packet.
 * Packets of any other protocol family are ignored.  The probe argument
 * order is fixed by the ip provider's probe signature, so the v4 case
 * passes a NULL ip6 header and the v6 case a NULL ip header.
 * Marked noinline; presumably to keep the probe site out of the
 * dlil_output hot path -- attribute taken as-is.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
#endif /* CONFIG_DTRACE */
6891
6892 /*
6893 * dlil_output
6894 *
6895 * Caller should have a lock on the protocol domain if the protocol
6896 * doesn't support finer grained locking. In most cases, the lock
6897 * will be held from the socket layer and won't be released until
6898 * we return back to the socket layer.
6899 *
6900 * This does mean that we must take a protocol lock before we take
6901 * an interface lock if we're going to take both. This makes sense
6902 * because a protocol is likely to interact with an ifp while it
6903 * is under the protocol lock.
6904 *
6905 * An advisory code will be returned if adv is not null. This
6906 * can be used to provide feedback about interface queues to the
6907 * application.
6908 */
errno_t
dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
    void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
{
	char *frame_type = NULL;
	char *dst_linkaddr = NULL;
	int retval = 0;
	/* scratch buffers filled in by the protocol's pre-output routine */
	char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
	char dst_linkaddr_buffer[MAX_LINKADDR * 4];
	struct if_proto *proto = NULL;
	mbuf_t m = NULL;
	mbuf_t send_head = NULL;	/* chain built for SENDLIST/ENQUEUE_MULTI */
	mbuf_t *send_tail = &send_head;
	int iorefcnt = 0;	/* 1 while we hold the datamov ref on ifp */
	u_int32_t pre = 0, post = 0;	/* bytes the framer added front/back */
	u_int32_t fpkts = 0, fbytes = 0;	/* forwarded pkt/byte counters */
	int32_t flen = 0;
	struct timespec now;
	u_int64_t now_nsec;
	boolean_t did_clat46 = FALSE;	/* TRUE once 4->6 translation happened */
	protocol_family_t old_proto_family = proto_family;
	struct sockaddr_in6 dest6;
	struct rtentry *rt = NULL;
	u_int32_t m_loop_set = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Get an io refcnt if the interface is attached to prevent ifnet_detach
	 * from happening while this operation is in progress
	 */
	if (!ifnet_datamov_begin(ifp)) {
		retval = ENXIO;
		goto cleanup;
	}
	iorefcnt = 1;

	VERIFY(ifp->if_output_dlil != NULL);

	/* update the driver's multicast filter, if needed */
	if (ifp->if_updatemcasts > 0) {
		if_mcasts_update_async(ifp);
		ifp->if_updatemcasts = 0;
	}

	frame_type = frame_type_buffer;
	dst_linkaddr = dst_linkaddr_buffer;

	/* for non-raw output, resolve the protocol attachment up front */
	if (raw == 0) {
		ifnet_lock_shared(ifp);
		/* callee holds a proto refcnt upon success */
		proto = find_attached_proto(ifp, proto_family);
		if (proto == NULL) {
			ifnet_lock_done(ifp);
			retval = ENXIO;
			goto cleanup;
		}
		ifnet_lock_done(ifp);
	}

preout_again:
	/* take the next packet off the caller's chain */
	if (packetlist == NULL) {
		goto cleanup;
	}

	m = packetlist;
	packetlist = packetlist->m_nextpkt;
	m->m_nextpkt = NULL;

	m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);

	/*
	 * Perform address family translation for the first
	 * packet outside the loop in order to perform address
	 * lookup for the translated proto family.
	 */
	if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
	    (ifp->if_type == IFT_CELLULAR ||
	    dlil_is_clat_needed(proto_family, m))) {
		retval = dlil_clat46(ifp, &proto_family, &m);
		/*
		 * Go to the next packet if translation fails
		 */
		if (retval != 0) {
			m_freem(m);
			m = NULL;
			ip6stat.ip6s_clat464_out_drop++;
			/* Make sure that the proto family is PF_INET */
			ASSERT(proto_family == PF_INET);
			goto preout_again;
		}
		/*
		 * Free the old one and make it point to the IPv6 proto structure.
		 *
		 * Change proto for the first time we have successfully
		 * performed address family translation.
		 */
		if (!did_clat46 && proto_family == PF_INET6) {
			did_clat46 = TRUE;

			if (proto != NULL) {
				if_proto_free(proto);
			}
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			proto = find_attached_proto(ifp, proto_family);
			if (proto == NULL) {
				ifnet_lock_done(ifp);
				retval = ENXIO;
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
			ifnet_lock_done(ifp);
			if (ifp->if_type == IFT_ETHER) {
				/* Update the dest to translated v6 address */
				dest6.sin6_len = sizeof(struct sockaddr_in6);
				dest6.sin6_family = AF_INET6;
				dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
				dest = (const struct sockaddr *)&dest6;

				/*
				 * Lookup route to the translated destination
				 * Free this route ref during cleanup
				 */
				rt = rtalloc1_scoped((struct sockaddr *)&dest6,
				    0, 0, ifp->if_index);

				route = rt;
			}
		}
	}

	/*
	 * This path gets packet chain going to the same destination.
	 * The pre output routine is used to either trigger resolution of
	 * the next hop or retreive the next hop's link layer addressing.
	 * For ex: ether_inet(6)_pre_output routine.
	 *
	 * If the routine returns EJUSTRETURN, it implies that packet has
	 * been queued, and therefore we have to call preout_again for the
	 * following packet in the chain.
	 *
	 * For errors other than EJUSTRETURN, the current packet is freed
	 * and the rest of the chain (pointed by packetlist is freed as
	 * part of clean up.
	 *
	 * Else if there is no error the retrieved information is used for
	 * all the packets in the chain.
	 */
	if (raw == 0) {
		proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
		retval = 0;
		if (preoutp != NULL) {
			retval = preoutp(ifp, proto_family, &m, dest, route,
			    frame_type, dst_linkaddr);

			if (retval != 0) {
				if (retval == EJUSTRETURN) {
					goto preout_again;
				}
				m_freem(m);
				m = NULL;
				goto cleanup;
			}
		}
	}

	/* per-packet loop: frame, filter, and hand each packet down */
	do {
		/*
		 * pkt_hdr is set here to point to m_data prior to
		 * calling into the framer. This value of pkt_hdr is
		 * used by the netif gso logic to retrieve the ip header
		 * for the TCP packets, offloaded for TSO processing.
		 */
		if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
			uint8_t vlan_encap_len = 0;

			if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
				vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
			}
			m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
		} else {
			m->m_pkthdr.pkt_hdr = mtod(m, void *);
		}

		/*
		 * Perform address family translation if needed.
		 * For now we only support stateless 4 to 6 translation
		 * on the out path.
		 *
		 * The routine below translates IP header, updates protocol
		 * checksum and also translates ICMP.
		 *
		 * We skip the first packet as it is already translated and
		 * the proto family is set to PF_INET6.
		 */
		if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    (ifp->if_type == IFT_CELLULAR ||
		    dlil_is_clat_needed(proto_family, m))) {
			retval = dlil_clat46(ifp, &proto_family, &m);
			/* Goto the next packet if the translation fails */
			if (retval != 0) {
				m_freem(m);
				m = NULL;
				ip6stat.ip6s_clat464_out_drop++;
				goto next;
			}
		}

#if CONFIG_DTRACE
		if (!raw) {
			dlil_output_dtrace(ifp, proto_family, m);
		}
#endif /* CONFIG_DTRACE */

		if (raw == 0 && ifp->if_framer != NULL) {
			int rcvif_set = 0;

			/*
			 * If this is a broadcast packet that needs to be
			 * looped back into the system, set the inbound ifp
			 * to that of the outbound ifp. This will allow
			 * us to determine that it is a legitimate packet
			 * for the system. Only set the ifp if it's not
			 * already set, just to be safe.
			 */
			if ((m->m_flags & (M_BCAST | M_LOOP)) &&
			    m->m_pkthdr.rcvif == NULL) {
				m->m_pkthdr.rcvif = ifp;
				rcvif_set = 1;
			}
			m_loop_set = m->m_flags & M_LOOP;
			retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
			    frame_type, &pre, &post);
			if (retval != 0) {
				if (retval != EJUSTRETURN) {
					m_freem(m);
				}
				goto next;
			}

			/*
			 * For partial checksum offload, adjust the start
			 * and stuff offsets based on the prepended header.
			 */
			if ((m->m_pkthdr.csum_flags &
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
			    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
				m->m_pkthdr.csum_tx_stuff += pre;
				m->m_pkthdr.csum_tx_start += pre;
			}

			if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
				dlil_output_cksum_dbg(ifp, m, pre,
				    proto_family);
			}

			/*
			 * Clear the ifp if it was set above, and to be
			 * safe, only if it is still the same as the
			 * outbound ifp we have in context. If it was
			 * looped back, then a copy of it was sent to the
			 * loopback interface with the rcvif set, and we
			 * are clearing the one that will go down to the
			 * layer below.
			 */
			if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
				m->m_pkthdr.rcvif = NULL;
			}
		}

		/*
		 * Let interface filters (if any) do their thing ...
		 */
		retval = dlil_interface_filters_output(ifp, &m, proto_family);
		if (retval != 0) {
			if (retval != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * Strip away M_PROTO1 bit prior to sending packet
		 * to the driver as this field may be used by the driver
		 */
		m->m_flags &= ~M_PROTO1;

		/*
		 * If the underlying interface is not capable of handling a
		 * packet whose data portion spans across physically disjoint
		 * pages, we need to "normalize" the packet so that we pass
		 * down a chain of mbufs where each mbuf points to a span that
		 * resides in the system page boundary. If the packet does
		 * not cross page(s), the following is a no-op.
		 */
		if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
			if ((m = m_normalize(m)) == NULL) {
				goto next;
			}
		}

		/*
		 * If this is a TSO packet, make sure the interface still
		 * advertise TSO capability.
		 */
		if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
			retval = EMSGSIZE;
			m_freem(m);
			goto cleanup;
		}

		ifp_inc_traffic_class_out(ifp, m);

#if SKYWALK
		/*
		 * For native skywalk devices, packets will be passed to pktap
		 * after GSO or after the mbuf to packet conversion.
		 * This is done for IPv4/IPv6 packets only because there is no
		 * space in the mbuf to pass down the proto family.
		 */
		if (dlil_is_native_netif_nexus(ifp)) {
			if (raw || m->m_pkthdr.pkt_proto == 0) {
				pktap_output(ifp, proto_family, m, pre, post);
				m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
			}
		} else {
			pktap_output(ifp, proto_family, m, pre, post);
		}
#else /* SKYWALK */
		pktap_output(ifp, proto_family, m, pre, post);
#endif /* SKYWALK */

		/*
		 * Count the number of elements in the mbuf chain
		 */
		if (tx_chain_len_count) {
			dlil_count_chain_len(m, &tx_chain_len_stats);
		}

		/*
		 * Record timestamp; ifnet_enqueue() will use this info
		 * rather than redoing the work. An optimization could
		 * involve doing this just once at the top, if there are
		 * no interface filters attached, but that's probably
		 * not a big deal.
		 */
		nanouptime(&now);
		net_timernsec(&now, &now_nsec);
		(void) mbuf_set_timestamp(m, now_nsec, TRUE);

		/*
		 * Discard partial sum information if this packet originated
		 * from another interface; the packet would already have the
		 * final checksum and we shouldn't recompute it.
		 */
		if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
		    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
			m->m_pkthdr.csum_data = 0;
		}

		/*
		 * Finally, call the driver.
		 */
		if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
			/* batch-capable driver: accumulate a chain and
			 * hand it down in one shot after the loop */
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen += (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			}
			*send_tail = m;
			send_tail = &m->m_nextpkt;
		} else {
			if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
				flen = (m_pktlen(m) - (pre + post));
				m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
			} else {
				flen = 0;
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
			    0, 0, 0, 0, 0);
			retval = (*ifp->if_output_dlil)(ifp, m);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				/* translate queue state into flow advisory */
				if (adv != NULL && adv->code == FADV_SUCCESS) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp),
				    retval);
			}
			KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
			    0, 0, 0, 0, 0);
		}
		/*
		 * NOTE(review): DBG_FUNC_END is also emitted inside the
		 * else branch just above, so the single-packet path logs
		 * the end event twice -- confirm this is intended.
		 */
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

next:
		/* advance to the next packet on the caller's chain */
		m = packetlist;
		if (m != NULL) {
			m->m_flags |= m_loop_set;
			packetlist = packetlist->m_nextpkt;
			m->m_nextpkt = NULL;
		}
		/* Reset the proto family to old proto family for CLAT */
		if (did_clat46) {
			proto_family = old_proto_family;
		}
	} while (m != NULL);

	/* flush the chain accumulated for batch-capable drivers */
	if (send_head != NULL) {
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
		    0, 0, 0, 0, 0);
		if (ifp->if_eflags & IFEF_SENDLIST) {
			/* driver accepts a whole chain at once */
			retval = (*ifp->if_output_dlil)(ifp, send_head);
			if (retval == EQFULL || retval == EQSUSPENDED) {
				if (adv != NULL) {
					adv->code = (retval == EQFULL ?
					    FADV_FLOW_CONTROLLED :
					    FADV_SUSPENDED);
				}
				retval = 0;
			}
			if (retval == 0 && flen > 0) {
				fbytes += flen;
				fpkts++;
			}
			if (retval != 0 && dlil_verbose) {
				DLIL_PRINTF("%s: output error on %s retval = %d\n",
				    __func__, if_name(ifp), retval);
			}
		} else {
			/* ENQUEUE_MULTI: enqueue one at a time, then kick
			 * the interface start thread once at the end */
			struct mbuf *send_m;
			int enq_cnt = 0;
			VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
			while (send_head != NULL) {
				send_m = send_head;
				send_head = send_m->m_nextpkt;
				send_m->m_nextpkt = NULL;
				retval = (*ifp->if_output_dlil)(ifp, send_m);
				if (retval == EQFULL || retval == EQSUSPENDED) {
					if (adv != NULL) {
						adv->code = (retval == EQFULL ?
						    FADV_FLOW_CONTROLLED :
						    FADV_SUSPENDED);
					}
					retval = 0;
				}
				if (retval == 0) {
					enq_cnt++;
					if (flen > 0) {
						fpkts++;
					}
				}
				if (retval != 0 && dlil_verbose) {
					DLIL_PRINTF("%s: output error on %s "
					    "retval = %d\n",
					    __func__, if_name(ifp), retval);
				}
			}
			if (enq_cnt > 0) {
				fbytes += flen;
				ifnet_start(ifp);
			}
		}
		KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);

cleanup:
	/* fold forwarded traffic counters into the interface stats */
	if (fbytes > 0) {
		ifp->if_fbytes += fbytes;
	}
	if (fpkts > 0) {
		ifp->if_fpackets += fpkts;
	}
	if (proto != NULL) {
		if_proto_free(proto);
	}
	if (packetlist) { /* if any packets are left, clean up */
		mbuf_freem_list(packetlist);
	}
	if (retval == EJUSTRETURN) {
		retval = 0;
	}
	if (iorefcnt == 1) {
		ifnet_datamov_end(ifp);
	}
	if (rt != NULL) {
		rtfree(rt);
		rt = NULL;
	}

	return retval;
}
7414
7415 /*
7416 * This routine checks if the destination address is not a loopback, link-local,
7417 * multicast or broadcast address.
7418 */
7419 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7420 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7421 {
7422 int ret = 0;
7423 switch (proto_family) {
7424 case PF_INET: {
7425 struct ip *iph = mtod(m, struct ip *);
7426 if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7427 ret = 1;
7428 }
7429 break;
7430 }
7431 case PF_INET6: {
7432 struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7433 if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7434 CLAT64_NEEDED(&ip6h->ip6_dst)) {
7435 ret = 1;
7436 }
7437 break;
7438 }
7439 }
7440
7441 return ret;
7442 }
7443 /*
7444 * @brief This routine translates IPv4 packet to IPv6 packet,
7445 * updates protocol checksum and also translates ICMP for code
7446 * along with inner header translation.
7447 *
7448 * @param ifp Pointer to the interface
7449 * @param proto_family pointer to protocol family. It is updated if function
7450 * performs the translation successfully.
7451 * @param m Pointer to the pointer pointing to the packet. Needed because this
7452 * routine can end up changing the mbuf to a different one.
7453 *
7454 * @return 0 on success or else a negative value.
7455 */
7456 static errno_t
dlil_clat46(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7457 dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7458 {
7459 VERIFY(*proto_family == PF_INET);
7460 VERIFY(IS_INTF_CLAT46(ifp));
7461
7462 pbuf_t pbuf_store, *pbuf = NULL;
7463 struct ip *iph = NULL;
7464 struct in_addr osrc, odst;
7465 uint8_t proto = 0;
7466 struct in6_ifaddr *ia6_clat_src = NULL;
7467 struct in6_addr *src = NULL;
7468 struct in6_addr dst;
7469 int error = 0;
7470 uint16_t off = 0;
7471 uint16_t tot_len = 0;
7472 uint16_t ip_id_val = 0;
7473 uint16_t ip_frag_off = 0;
7474
7475 boolean_t is_frag = FALSE;
7476 boolean_t is_first_frag = TRUE;
7477 boolean_t is_last_frag = TRUE;
7478
7479 pbuf_init_mbuf(&pbuf_store, *m, ifp);
7480 pbuf = &pbuf_store;
7481 iph = pbuf->pb_data;
7482
7483 osrc = iph->ip_src;
7484 odst = iph->ip_dst;
7485 proto = iph->ip_p;
7486 off = (uint16_t)(iph->ip_hl << 2);
7487 ip_id_val = iph->ip_id;
7488 ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;
7489
7490 tot_len = ntohs(iph->ip_len);
7491
7492 /*
7493 * For packets that are not first frags
7494 * we only need to adjust CSUM.
7495 * For 4 to 6, Fragmentation header gets appended
7496 * after proto translation.
7497 */
7498 if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
7499 is_frag = TRUE;
7500
7501 /* If the offset is not zero, it is not first frag */
7502 if (ip_frag_off != 0) {
7503 is_first_frag = FALSE;
7504 }
7505
7506 /* If IP_MF is set, then it is not last frag */
7507 if (ntohs(iph->ip_off) & IP_MF) {
7508 is_last_frag = FALSE;
7509 }
7510 }
7511
7512 /*
7513 * Retrive the local IPv6 CLAT46 address reserved for stateless
7514 * translation.
7515 */
7516 ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7517 if (ia6_clat_src == NULL) {
7518 ip6stat.ip6s_clat464_out_nov6addr_drop++;
7519 error = -1;
7520 goto cleanup;
7521 }
7522
7523 src = &ia6_clat_src->ia_addr.sin6_addr;
7524
7525 /*
7526 * Translate IPv4 destination to IPv6 destination by using the
7527 * prefixes learned through prior PLAT discovery.
7528 */
7529 if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
7530 ip6stat.ip6s_clat464_out_v6synthfail_drop++;
7531 goto cleanup;
7532 }
7533
7534 /* Translate the IP header part first */
7535 error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
7536 iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;
7537
7538 iph = NULL; /* Invalidate iph as pbuf has been modified */
7539
7540 if (error != 0) {
7541 ip6stat.ip6s_clat464_out_46transfail_drop++;
7542 goto cleanup;
7543 }
7544
7545 /*
7546 * Translate protocol header, update checksum, checksum flags
7547 * and related fields.
7548 */
7549 error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
7550 proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;
7551
7552 if (error != 0) {
7553 ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
7554 goto cleanup;
7555 }
7556
7557 /* Now insert the IPv6 fragment header */
7558 if (is_frag) {
7559 error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);
7560
7561 if (error != 0) {
7562 ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
7563 goto cleanup;
7564 }
7565 }
7566
7567 cleanup:
7568 if (ia6_clat_src != NULL) {
7569 IFA_REMREF(&ia6_clat_src->ia_ifa);
7570 }
7571
7572 if (pbuf_is_valid(pbuf)) {
7573 *m = pbuf->pb_mbuf;
7574 pbuf->pb_mbuf = NULL;
7575 pbuf_destroy(pbuf);
7576 } else {
7577 error = -1;
7578 *m = NULL;
7579 ip6stat.ip6s_clat464_out_invalpbuf_drop++;
7580 }
7581
7582 if (error == 0) {
7583 *proto_family = PF_INET6;
7584 ip6stat.ip6s_clat464_out_success++;
7585 }
7586
7587 return error;
7588 }
7589
7590 /*
7591 * @brief This routine translates incoming IPv6 to IPv4 packet,
7592 * updates protocol checksum and also translates ICMPv6 outer
7593 * and inner headers
7594 *
7595 * @return 0 on success or else a negative value.
7596 */
7597 static errno_t
dlil_clat64(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7598 dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7599 {
7600 VERIFY(*proto_family == PF_INET6);
7601 VERIFY(IS_INTF_CLAT46(ifp));
7602
7603 struct ip6_hdr *ip6h = NULL;
7604 struct in6_addr osrc, odst;
7605 uint8_t proto = 0;
7606 struct in6_ifaddr *ia6_clat_dst = NULL;
7607 struct in_ifaddr *ia4_clat_dst = NULL;
7608 struct in_addr *dst = NULL;
7609 struct in_addr src;
7610 int error = 0;
7611 uint32_t off = 0;
7612 u_int64_t tot_len = 0;
7613 uint8_t tos = 0;
7614 boolean_t is_first_frag = TRUE;
7615
7616 /* Incoming mbuf does not contain valid IP6 header */
7617 if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
7618 ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
7619 (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
7620 ip6stat.ip6s_clat464_in_tooshort_drop++;
7621 return -1;
7622 }
7623
7624 ip6h = mtod(*m, struct ip6_hdr *);
7625 /* Validate that mbuf contains IP payload equal to ip6_plen */
7626 if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
7627 ip6stat.ip6s_clat464_in_tooshort_drop++;
7628 return -1;
7629 }
7630
7631 osrc = ip6h->ip6_src;
7632 odst = ip6h->ip6_dst;
7633
7634 /*
7635 * Retrieve the local CLAT46 reserved IPv6 address.
7636 * Let the packet pass if we don't find one, as the flag
7637 * may get set before IPv6 configuration has taken place.
7638 */
7639 ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7640 if (ia6_clat_dst == NULL) {
7641 goto done;
7642 }
7643
7644 /*
7645 * Check if the original dest in the packet is same as the reserved
7646 * CLAT46 IPv6 address
7647 */
7648 if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
7649 pbuf_t pbuf_store, *pbuf = NULL;
7650 pbuf_init_mbuf(&pbuf_store, *m, ifp);
7651 pbuf = &pbuf_store;
7652
7653 /*
7654 * Retrive the local CLAT46 IPv4 address reserved for stateless
7655 * translation.
7656 */
7657 ia4_clat_dst = inifa_ifpclatv4(ifp);
7658 if (ia4_clat_dst == NULL) {
7659 IFA_REMREF(&ia6_clat_dst->ia_ifa);
7660 ip6stat.ip6s_clat464_in_nov4addr_drop++;
7661 error = -1;
7662 goto cleanup;
7663 }
7664 IFA_REMREF(&ia6_clat_dst->ia_ifa);
7665
7666 /* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
7667 dst = &ia4_clat_dst->ia_addr.sin_addr;
7668 if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
7669 ip6stat.ip6s_clat464_in_v4synthfail_drop++;
7670 error = -1;
7671 goto cleanup;
7672 }
7673
7674 ip6h = pbuf->pb_data;
7675 off = sizeof(struct ip6_hdr);
7676 proto = ip6h->ip6_nxt;
7677 tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
7678 tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);
7679
7680 /*
7681 * Translate the IP header and update the fragmentation
7682 * header if needed
7683 */
7684 error = (nat464_translate_64(pbuf, off, tos, &proto,
7685 ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
7686 0 : -1;
7687
7688 ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */
7689
7690 if (error != 0) {
7691 ip6stat.ip6s_clat464_in_64transfail_drop++;
7692 goto cleanup;
7693 }
7694
7695 /*
7696 * Translate protocol header, update checksum, checksum flags
7697 * and related fields.
7698 */
7699 error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
7700 (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
7701 NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;
7702
7703 if (error != 0) {
7704 ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
7705 goto cleanup;
7706 }
7707
7708 cleanup:
7709 if (ia4_clat_dst != NULL) {
7710 IFA_REMREF(&ia4_clat_dst->ia_ifa);
7711 }
7712
7713 if (pbuf_is_valid(pbuf)) {
7714 *m = pbuf->pb_mbuf;
7715 pbuf->pb_mbuf = NULL;
7716 pbuf_destroy(pbuf);
7717 } else {
7718 error = -1;
7719 ip6stat.ip6s_clat464_in_invalpbuf_drop++;
7720 }
7721
7722 if (error == 0) {
7723 *proto_family = PF_INET;
7724 ip6stat.ip6s_clat464_in_success++;
7725 }
7726 } /* CLAT traffic */
7727
7728 done:
7729 return error;
7730 }
7731
7732 /* The following is used to enqueue work items for ifnet ioctl events */
7733 static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);
7734
/* Arguments captured for one deferred ifnet ioctl work item. */
struct ifnet_ioctl_event {
	struct ifnet *ifp;      /* interface; carries an extra io refcnt */
	u_long ioctl_code;      /* e.g. SIOCADDMULTI / SIOCDELMULTI */
};

/*
 * Work-queue entry embedding the ioctl arguments.  The callback recovers
 * this container from the embedded nwk_wqe via __container_of().
 */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;            /* work-queue linkage */
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7744
7745 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7746 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7747 {
7748 struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7749 bool compare_expected;
7750
7751 /*
7752 * Get an io ref count if the interface is attached.
7753 * At this point it most likely is. We are taking a reference for
7754 * deferred processing.
7755 */
7756 if (!ifnet_is_attached(ifp, 1)) {
7757 os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7758 "is not attached",
7759 __func__, __LINE__, if_name(ifp), ioctl_code);
7760 return;
7761 }
7762 switch (ioctl_code) {
7763 case SIOCADDMULTI:
7764 compare_expected = false;
7765 if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
7766 ifnet_decr_iorefcnt(ifp);
7767 return;
7768 }
7769 break;
7770 case SIOCDELMULTI:
7771 compare_expected = false;
7772 if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
7773 ifnet_decr_iorefcnt(ifp);
7774 return;
7775 }
7776 break;
7777 default:
7778 os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
7779 __func__, __LINE__, if_name(ifp), ioctl_code);
7780 return;
7781 }
7782
7783 p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7784 Z_WAITOK | Z_ZERO | Z_NOFAIL);
7785
7786 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7787 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7788 p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7789 nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7790 }
7791
7792 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7793 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7794 {
7795 struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7796 struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7797
7798 struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7799 u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7800 int ret = 0;
7801
7802 switch (ioctl_code) {
7803 case SIOCADDMULTI:
7804 atomic_store(&ifp->if_mcast_add_signaled, false);
7805 break;
7806 case SIOCDELMULTI:
7807 atomic_store(&ifp->if_mcast_del_signaled, false);
7808 break;
7809 }
7810 if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7811 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7812 __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7813 } else if (dlil_verbose) {
7814 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7815 "for ioctl %lu",
7816 __func__, __LINE__, if_name(ifp), ioctl_code);
7817 }
7818 ifnet_decr_iorefcnt(ifp);
7819 kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7820 return;
7821 }
7822
/*
 * Dispatch an ioctl for an interface: first through every attached
 * interface filter, then the attached protocol (if proto_fam != 0),
 * and finally the driver's own if_ioctl.  `retval' stays EOPNOTSUPP
 * until someone handles the request; EJUSTRETURN short-circuits the
 * chain and is reported to the caller as success (0).
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;        /* "nobody handled it yet" sentinel */
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock for the upcall into the filter */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled, stop processing"; report success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7940
7941 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7942 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7943 {
7944 errno_t error = 0;
7945
7946 if (ifp->if_set_bpf_tap) {
7947 /* Get an io reference on the interface if it is attached */
7948 if (!ifnet_is_attached(ifp, 1)) {
7949 return ENXIO;
7950 }
7951 error = ifp->if_set_bpf_tap(ifp, mode, callback);
7952 ifnet_decr_iorefcnt(ifp);
7953 }
7954 return error;
7955 }
7956
7957 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7958 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7959 struct sockaddr *ll_addr, size_t ll_len)
7960 {
7961 errno_t result = EOPNOTSUPP;
7962 struct if_proto *proto;
7963 const struct sockaddr *verify;
7964 proto_media_resolve_multi resolvep;
7965
7966 if (!ifnet_is_attached(ifp, 1)) {
7967 return result;
7968 }
7969
7970 bzero(ll_addr, ll_len);
7971
7972 /* Call the protocol first; callee holds a proto refcnt upon success */
7973 ifnet_lock_shared(ifp);
7974 proto = find_attached_proto(ifp, proto_addr->sa_family);
7975 ifnet_lock_done(ifp);
7976 if (proto != NULL) {
7977 resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7978 proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7979 if (resolvep != NULL) {
7980 result = resolvep(ifp, proto_addr,
7981 (struct sockaddr_dl *)(void *)ll_addr, ll_len);
7982 }
7983 if_proto_free(proto);
7984 }
7985
7986 /* Let the interface verify the multicast address */
7987 if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7988 if (result == 0) {
7989 verify = ll_addr;
7990 } else {
7991 verify = proto_addr;
7992 }
7993 result = ifp->if_check_multi(ifp, verify);
7994 }
7995
7996 ifnet_decr_iorefcnt(ifp);
7997 return result;
7998 }
7999
8000 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8001 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
8002 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8003 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8004 {
8005 struct if_proto *proto;
8006 errno_t result = 0;
8007
8008 if ((ifp->if_flags & IFF_NOARP) != 0) {
8009 result = ENOTSUP;
8010 goto done;
8011 }
8012
8013 /* callee holds a proto refcnt upon success */
8014 ifnet_lock_shared(ifp);
8015 proto = find_attached_proto(ifp, target_proto->sa_family);
8016 ifnet_lock_done(ifp);
8017 if (proto == NULL) {
8018 result = ENOTSUP;
8019 } else {
8020 proto_media_send_arp arpp;
8021 arpp = (proto->proto_kpi == kProtoKPI_v1 ?
8022 proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
8023 if (arpp == NULL) {
8024 result = ENOTSUP;
8025 } else {
8026 switch (arpop) {
8027 case ARPOP_REQUEST:
8028 arpstat.txrequests++;
8029 if (target_hw != NULL) {
8030 arpstat.txurequests++;
8031 }
8032 break;
8033 case ARPOP_REPLY:
8034 arpstat.txreplies++;
8035 break;
8036 }
8037 result = arpp(ifp, arpop, sender_hw, sender_proto,
8038 target_hw, target_proto);
8039 }
8040 if_proto_free(proto);
8041 }
8042 done:
8043 return result;
8044 }
8045
/*
 * Opaque zero-sized marker type: a net_thread_marks_t value is a pointer
 * offset from &net_thread_marks_base encoding the u_int32_t bitmask of
 * marks that a push changed (see net_thread_marks_push/pop below).
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* "No marks were changed": offset 0 from the base object. */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
8051
/*
 * Set the requested mark bits on the current thread and return a cookie
 * recording which bits this call actually set (bits already set are not
 * recorded), so the matching net_thread_marks_pop() restores exactly the
 * prior state.  The cookie is encoded as a pointer offset from `base'.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* only the bits not already set need undoing later */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	/* encode the bits-to-undo as an offset from the base pointer */
	return (net_thread_marks_t)&base[pop];
}
8069
/*
 * Clear the requested mark bits on the current thread and return a
 * cookie recording which bits this call actually cleared, so the
 * matching net_thread_unmarks_pop() restores exactly the prior state.
 * Cookie encoding matches net_thread_marks_push() (offset from `base').
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* only the bits currently set need restoring later */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	/* encode the bits-to-restore as an offset from the base pointer */
	return (net_thread_marks_t)&base[unpop];
}
8087
/*
 * Undo a net_thread_marks_push(): decode the cookie back into the mask
 * of bits that push set, verify they are still set on this thread, and
 * clear them.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* the cookie must decode to a 32-bit mask of bits all set */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
8103
/*
 * Undo a net_thread_unmarks_push(): decode the cookie back into the
 * mask of bits that unpush cleared, verify they are still clear on this
 * thread, and set them again.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* the cookie must decode to a 32-bit mask of bits all clear */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
8119
8120 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)8121 net_thread_is_marked(u_int32_t check)
8122 {
8123 if (check != 0) {
8124 struct uthread *uth = current_uthread();
8125 return uth->uu_network_marks & check;
8126 } else {
8127 return 0;
8128 }
8129 }
8130
8131 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)8132 net_thread_is_unmarked(u_int32_t check)
8133 {
8134 if (check != 0) {
8135 struct uthread *uth = current_uthread();
8136 return ~uth->uu_network_marks & check;
8137 } else {
8138 return 0;
8139 }
8140 }
8141
/*
 * An ARP announcement (gratuitous ARP) carries the same IPv4 address as
 * sender and target; report whether this request is one.  Returns 0 when
 * either address is missing.
 */
static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,
    const struct sockaddr_in * target_sin)
{
	return sender_sin != NULL && target_sin != NULL &&
	    sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
}
8152
/*
 * Send an ARP packet.  Normally delegates to dlil_send_arp_internal()
 * on the given interface; the special case is an ARP *request* for an
 * IPv4 link-local target (and not an announcement), which is fanned out
 * to every IFEF_ARPLL-capable interface that has an IPv4 source address.
 * RTF_ROUTER targets are flagged via SIN_ROUTER for the send_arp hook.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces. The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t *ifp_list;
		u_int32_t count;
		u_int32_t ifp_on;

		/* stays ENOTSUP unless at least one interface accepts it */
		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing. This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* hold the link address across the unlock */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* keep the first definitive result */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
8267
8268 /*
8269 * Caller must hold ifnet head lock.
8270 */
8271 static int
ifnet_lookup(struct ifnet * ifp)8272 ifnet_lookup(struct ifnet *ifp)
8273 {
8274 struct ifnet *_ifp;
8275
8276 LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
8277 TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
8278 if (_ifp == ifp) {
8279 break;
8280 }
8281 }
8282 return _ifp != NULL;
8283 }
8284
8285 /*
8286 * Caller has to pass a non-zero refio argument to get a
8287 * IO reference count. This will prevent ifnet_detach from
8288 * being called when there are outstanding io reference counts.
8289 */
8290 int
ifnet_is_attached(struct ifnet * ifp,int refio)8291 ifnet_is_attached(struct ifnet *ifp, int refio)
8292 {
8293 int ret;
8294
8295 lck_mtx_lock_spin(&ifp->if_ref_lock);
8296 if ((ret = IF_FULLY_ATTACHED(ifp))) {
8297 if (refio > 0) {
8298 ifp->if_refio++;
8299 }
8300 }
8301 lck_mtx_unlock(&ifp->if_ref_lock);
8302
8303 return ret;
8304 }
8305
8306 void
ifnet_incr_pending_thread_count(struct ifnet * ifp)8307 ifnet_incr_pending_thread_count(struct ifnet *ifp)
8308 {
8309 lck_mtx_lock_spin(&ifp->if_ref_lock);
8310 ifp->if_threads_pending++;
8311 lck_mtx_unlock(&ifp->if_ref_lock);
8312 }
8313
8314 void
ifnet_decr_pending_thread_count(struct ifnet * ifp)8315 ifnet_decr_pending_thread_count(struct ifnet *ifp)
8316 {
8317 lck_mtx_lock_spin(&ifp->if_ref_lock);
8318 VERIFY(ifp->if_threads_pending > 0);
8319 ifp->if_threads_pending--;
8320 if (ifp->if_threads_pending == 0) {
8321 wakeup(&ifp->if_threads_pending);
8322 }
8323 lck_mtx_unlock(&ifp->if_ref_lock);
8324 }
8325
8326 /*
8327 * Caller must ensure the interface is attached; the assumption is that
8328 * there is at least an outstanding IO reference count held already.
8329 * Most callers would call ifnet_is_{attached,data_ready}() instead.
8330 */
8331 void
ifnet_incr_iorefcnt(struct ifnet * ifp)8332 ifnet_incr_iorefcnt(struct ifnet *ifp)
8333 {
8334 lck_mtx_lock_spin(&ifp->if_ref_lock);
8335 VERIFY(IF_FULLY_ATTACHED(ifp));
8336 VERIFY(ifp->if_refio > 0);
8337 ifp->if_refio++;
8338 lck_mtx_unlock(&ifp->if_ref_lock);
8339 }
8340
/*
 * Drop one io reference with if_ref_lock held.  When the count reaches
 * zero while a detach is in progress, wake the ifnet_detach thread that
 * is sleeping on if_refio.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* data movers hold io refs, so refio can't drop below datamov */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8361
/* Drop one io reference, taking and releasing the ref lock. */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8369
8370 boolean_t
ifnet_datamov_begin(struct ifnet * ifp)8371 ifnet_datamov_begin(struct ifnet *ifp)
8372 {
8373 boolean_t ret;
8374
8375 lck_mtx_lock_spin(&ifp->if_ref_lock);
8376 if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8377 ifp->if_refio++;
8378 ifp->if_datamov++;
8379 }
8380 lck_mtx_unlock(&ifp->if_ref_lock);
8381
8382 return ret;
8383 }
8384
/*
 * Leave the data path: drop the data-mover count (waking any drainers
 * once it reaches zero) and release the io reference taken by
 * ifnet_datamov_begin().
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8402
8403 static void
ifnet_datamov_suspend_locked(struct ifnet * ifp)8404 ifnet_datamov_suspend_locked(struct ifnet *ifp)
8405 {
8406 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
8407 ifp->if_refio++;
8408 if (ifp->if_suspend++ == 0) {
8409 VERIFY(ifp->if_refflags & IFRF_READY);
8410 ifp->if_refflags &= ~IFRF_READY;
8411 }
8412 }
8413
/* Suspend data movement, taking the ref lock around the locked helper. */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8422
8423 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8424 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8425 {
8426 lck_mtx_lock_spin(&ifp->if_ref_lock);
8427 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8428 if (ifp->if_suspend > 0) {
8429 lck_mtx_unlock(&ifp->if_ref_lock);
8430 return FALSE;
8431 }
8432 ifnet_datamov_suspend_locked(ifp);
8433 lck_mtx_unlock(&ifp->if_ref_lock);
8434 return TRUE;
8435 }
8436
/*
 * Wait for all in-flight data movers to leave the data path, then purge
 * the transmit queues.  Data movement must already be suspended (the
 * IFRF_READY flag cleared) so no new movers can enter while we sleep.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* sleep until ifnet_datamov_end() drops if_datamov to zero */
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8464
/* Convenience: suspend data movement, then drain in-flight data movers. */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8471
8472 void
ifnet_datamov_resume(struct ifnet * ifp)8473 ifnet_datamov_resume(struct ifnet *ifp)
8474 {
8475 lck_mtx_lock(&ifp->if_ref_lock);
8476 /* data movement must already be suspended */
8477 VERIFY(ifp->if_suspend > 0);
8478 if (--ifp->if_suspend == 0) {
8479 VERIFY(!(ifp->if_refflags & IFRF_READY));
8480 ifp->if_refflags |= IFRF_READY;
8481 }
8482 ifnet_decr_iorefcnt_locked(ifp);
8483 lck_mtx_unlock(&ifp->if_ref_lock);
8484 }
8485
8486 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8487 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8488 {
8489 struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8490 ctrace_t *tr;
8491 u_int32_t idx;
8492 u_int16_t *cnt;
8493
8494 if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8495 panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8496 /* NOTREACHED */
8497 }
8498
8499 if (refhold) {
8500 cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8501 tr = dl_if_dbg->dldbg_if_refhold;
8502 } else {
8503 cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8504 tr = dl_if_dbg->dldbg_if_refrele;
8505 }
8506
8507 idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8508 ctrace_record(&tr[idx]);
8509 }
8510
8511 errno_t
dlil_if_ref(struct ifnet * ifp)8512 dlil_if_ref(struct ifnet *ifp)
8513 {
8514 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8515
8516 if (dl_if == NULL) {
8517 return EINVAL;
8518 }
8519
8520 lck_mtx_lock_spin(&dl_if->dl_if_lock);
8521 ++dl_if->dl_if_refcnt;
8522 if (dl_if->dl_if_refcnt == 0) {
8523 panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
8524 /* NOTREACHED */
8525 }
8526 if (dl_if->dl_if_trace != NULL) {
8527 (*dl_if->dl_if_trace)(dl_if, TRUE);
8528 }
8529 lck_mtx_unlock(&dl_if->dl_if_lock);
8530
8531 return 0;
8532 }
8533
8534 errno_t
dlil_if_free(struct ifnet * ifp)8535 dlil_if_free(struct ifnet *ifp)
8536 {
8537 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8538 bool need_release = FALSE;
8539
8540 if (dl_if == NULL) {
8541 return EINVAL;
8542 }
8543
8544 lck_mtx_lock_spin(&dl_if->dl_if_lock);
8545 switch (dl_if->dl_if_refcnt) {
8546 case 0:
8547 panic("%s: negative refcnt for ifp=%p", __func__, ifp);
8548 /* NOTREACHED */
8549 break;
8550 case 1:
8551 if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8552 need_release = TRUE;
8553 }
8554 break;
8555 default:
8556 break;
8557 }
8558 --dl_if->dl_if_refcnt;
8559 if (dl_if->dl_if_trace != NULL) {
8560 (*dl_if->dl_if_trace)(dl_if, FALSE);
8561 }
8562 lck_mtx_unlock(&dl_if->dl_if_lock);
8563 if (need_release) {
8564 _dlil_if_release(ifp, true);
8565 }
8566 return 0;
8567 }
8568
/*
 * Attach a protocol to its interface: refine the demux descriptors via
 * the family's if_add_proto, append the protocol to the interface's
 * proto hash bucket, and post a KEV_DL_PROTO_ATTACHED event.  On
 * success the attach holds one proto refcnt; *proto_count (optional)
 * receives the number of protocols attached afterwards.  EEXIST when
 * the family is already attached.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* hold an io ref across the attach so the ifp can't detach */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		/* already attached: drop the lookup's refcnt and bail */
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* walk to the tail of the bucket so insertion preserves order */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8648
/*
 * Post-attach bookkeeping common to v1 and v2 protocol attach: bring
 * the interface up, broadcast the flags change, and (with Skywalk) wire
 * up the flowswitch nexus when an IP protocol was attached.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8672
/*
 * Public KPI: attach a v1 protocol to an interface.  Validates the
 * arguments, copies the v1 callback table into a freshly zeroed if_proto,
 * and delegates to dlil_attach_protocol().  On success the interface is
 * marked up via dlil_handle_proto_attach(); on failure the if_proto is
 * freed here.  EEXIST when the family is already attached.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	/* head lock held across attach keeps ifp on the global list */
	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8734
/*
 * Public KPI: attach a v2 protocol to an interface.  Identical flow to
 * ifnet_attach_protocol() except the v2 callback table is copied.  On
 * success the interface is marked up via dlil_handle_proto_attach(); on
 * failure the if_proto is freed here.  EEXIST when already attached.
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	/* head lock held across attach keeps ifp on the global list */
	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8796
/*
 * Detach a protocol handler from an interface.
 *
 * The protocol's callbacks are replaced with inert ifproto_media_*
 * stubs (which return ENXIO / do nothing) so that any in-flight
 * callers racing with the detach fail gracefully instead of invoking
 * freed callbacks.  The proto record itself is freed only when its
 * last reference is dropped via if_proto_free().
 *
 * ifp          - interface to detach the protocol from.
 * proto_family - protocol family identifier (must be non-zero).
 *
 * Returns 0 on success, EINVAL on bad arguments, or ENXIO if the
 * protocol is not attached to this interface.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	/* unlink from the per-interface protocol hash */
	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/*
	 * Swap in inert stubs so that racing callers holding a
	 * reference to this proto hit safe no-op handlers.
	 */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8862
8863 static errno_t
ifproto_media_input_v1(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet,char * header)8864 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8865 struct mbuf *packet, char *header)
8866 {
8867 #pragma unused(ifp, protocol, packet, header)
8868 return ENXIO;
8869 }
8870
8871 static errno_t
ifproto_media_input_v2(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet)8872 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8873 struct mbuf *packet)
8874 {
8875 #pragma unused(ifp, protocol, packet)
8876 return ENXIO;
8877 }
8878
8879 static errno_t
ifproto_media_preout(struct ifnet * ifp,protocol_family_t protocol,mbuf_t * packet,const struct sockaddr * dest,void * route,char * frame_type,char * link_layer_dest)8880 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8881 mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8882 char *link_layer_dest)
8883 {
8884 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8885 return ENXIO;
8886 }
8887
8888 static void
ifproto_media_event(struct ifnet * ifp,protocol_family_t protocol,const struct kev_msg * event)8889 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8890 const struct kev_msg *event)
8891 {
8892 #pragma unused(ifp, protocol, event)
8893 }
8894
8895 static errno_t
ifproto_media_ioctl(struct ifnet * ifp,protocol_family_t protocol,unsigned long command,void * argument)8896 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8897 unsigned long command, void *argument)
8898 {
8899 #pragma unused(ifp, protocol, command, argument)
8900 return ENXIO;
8901 }
8902
8903 static errno_t
ifproto_media_resolve_multi(ifnet_t ifp,const struct sockaddr * proto_addr,struct sockaddr_dl * out_ll,size_t ll_len)8904 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8905 struct sockaddr_dl *out_ll, size_t ll_len)
8906 {
8907 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8908 return ENXIO;
8909 }
8910
8911 static errno_t
ifproto_media_send_arp(struct ifnet * ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8912 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8913 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8914 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8915 {
8916 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8917 return ENXIO;
8918 }
8919
8920 extern int if_next_index(void);
8921 extern int tcp_ecn_outbound;
8922
8923 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8924 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8925 {
8926 uint32_t sflags = 0;
8927 int err;
8928
8929 if (if_flowadv) {
8930 sflags |= PKTSCHEDF_QALG_FLOWCTL;
8931 }
8932
8933 if (if_delaybased_queue) {
8934 sflags |= PKTSCHEDF_QALG_DELAYBASED;
8935 }
8936
8937 if (ifp->if_output_sched_model ==
8938 IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8939 sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8940 }
8941 /* Inherit drop limit from the default queue */
8942 if (ifp->if_snd != ifcq) {
8943 IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8944 }
8945 /* Initialize transmit queue(s) */
8946 err = ifclassq_setup(ifcq, ifp, sflags);
8947 if (err != 0) {
8948 panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8949 "err=%d", __func__, ifp, err);
8950 /* NOTREACHED */
8951 }
8952 }
8953
/*
 * Attach an interface to the system: insert it into the global ifnet
 * list and ifindex2ifnet[], set up its link-layer address, protocol
 * hash, transmit classq, and kernel I/O threads (input, starter,
 * poller), then mark it IFRF_ATTACHED | IFRF_READY.
 *
 * ifp     - interface to attach; must currently be in the embryonic
 *           state (IFRF_EMBRYONIC set).
 * ll_addr - optional permanent link-layer address; when non-NULL its
 *           sdl_alen must match ifp->if_addrlen (or if_addrlen must
 *           still be 0, in which case it is adopted).
 *
 * Returns 0 on success, EINVAL on bad arguments or address-length
 * mismatch, EEXIST if already attached, ENODEV if the family module
 * callbacks are missing, or ENOBUFS on index/address exhaustion.
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	/* Attaching a non-embryonic ifnet is a fatal programming error */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	/* Adopt or validate the link-layer address length */
	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
	    PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Filter list must be empty and idle at attach time */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* A recycled (DLIF_REUSE) ifnet may carry over multicast entries */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	if (ifp->if_index == 0) {
		int idx = if_next_index();

		/*
		 * Since we exhausted the list of
		 * if_index's, try to find an empty slot
		 * in ifindex2ifnet.
		 */
		if (idx == -1 && if_index >= UINT16_MAX) {
			for (int i = 1; i < if_index; i++) {
				if (ifindex2ifnet[i] == NULL &&
				    ifnet_addrs[i - 1] == NULL) {
					idx = i;
					break;
				}
			}
		}
		/* No free slot at all: give up */
		if (idx == -1) {
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	IFA_ADDREF_LOCKED(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Make the interface visible to lookups */
	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	/* Initialize the default transmit classq */
	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		if (err == ENODEV) {
			/* no dedicated input thread for this interface */
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_pacemaker_time = 0;
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* boost the starter thread's scheduling precedence */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* boost the poller thread's scheduling precedence */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/* Count suspended link-layer memberships carried over by reuse */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that.  This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
9456
/*
 * Prepare the storage for the first/permanent link address, which
 * must have the same lifetime as the ifnet itself.  Although the link
 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
 * its location in memory must never change as it may still be referred
 * to by some parts of the system afterwards (unfortunate implementation
 * artifacts inherited from BSD.)
 *
 * Caller must hold ifnet lock as writer.
 */
/*
 * Allocate (or reuse) the ifaddr holding the interface's link-level
 * address, fill in its sockaddr_dl address and netmask, and install it
 * as ifp->if_lladdr.
 *
 * ifp     - interface whose link address is being (re)initialized;
 *           locked exclusively by the caller.
 * ll_addr - optional link-layer address to copy in; when NULL, the
 *           address portion is left empty (sdl_alen == 0).
 *
 * Returns the referenced ifaddr (a permanent allocation; never freed).
 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/* sockaddr_dl layout: name bytes first, then the address bytes */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			/* permanent: never freed, so no IFD_ALLOC below */
			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	/* embed the interface name, then the link-layer address */
	if (namelen > 0) {
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* netmask covers the name portion with all-ones bytes */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* drop the reference on any previously-installed link address */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9575
/*
 * Purge all IPv4 (when INET is configured) and IPv6 addresses from
 * the given interface.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9584
9585 errno_t
ifnet_detach(ifnet_t ifp)9586 ifnet_detach(ifnet_t ifp)
9587 {
9588 struct ifnet *delegated_ifp;
9589 struct nd_ifinfo *ndi = NULL;
9590
9591 if (ifp == NULL) {
9592 return EINVAL;
9593 }
9594
9595 ndi = ND_IFINFO(ifp);
9596 if (NULL != ndi) {
9597 ndi->cga_initialized = FALSE;
9598 }
9599
9600 /* Mark the interface down */
9601 if_down(ifp);
9602
9603 /*
9604 * IMPORTANT NOTE
9605 *
9606 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
9607 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
9608 * until after we've waited for all I/O references to drain
9609 * in ifnet_detach_final().
9610 */
9611
9612 ifnet_head_lock_exclusive();
9613 ifnet_lock_exclusive(ifp);
9614
9615 if (ifp->if_output_netem != NULL) {
9616 netem_destroy(ifp->if_output_netem);
9617 ifp->if_output_netem = NULL;
9618 }
9619
9620 /*
9621 * Check to see if this interface has previously triggered
9622 * aggressive protocol draining; if so, decrement the global
9623 * refcnt and clear PR_AGGDRAIN on the route domain if
9624 * there are no more of such an interface around.
9625 */
9626 (void) ifnet_set_idle_flags_locked(ifp, 0, ~0);
9627
9628 lck_mtx_lock_spin(&ifp->if_ref_lock);
9629 if (!(ifp->if_refflags & IFRF_ATTACHED)) {
9630 lck_mtx_unlock(&ifp->if_ref_lock);
9631 ifnet_lock_done(ifp);
9632 ifnet_head_done();
9633 return EINVAL;
9634 } else if (ifp->if_refflags & IFRF_DETACHING) {
9635 /* Interface has already been detached */
9636 lck_mtx_unlock(&ifp->if_ref_lock);
9637 ifnet_lock_done(ifp);
9638 ifnet_head_done();
9639 return ENXIO;
9640 }
9641 VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
9642 /* Indicate this interface is being detached */
9643 ifp->if_refflags &= ~IFRF_ATTACHED;
9644 ifp->if_refflags |= IFRF_DETACHING;
9645 lck_mtx_unlock(&ifp->if_ref_lock);
9646
9647 if (dlil_verbose) {
9648 DLIL_PRINTF("%s: detaching\n", if_name(ifp));
9649 }
9650
9651 /* clean up flow control entry object if there's any */
9652 if (ifp->if_eflags & IFEF_TXSTART) {
9653 ifnet_flowadv(ifp->if_flowhash);
9654 }
9655
9656 /* Reset ECN enable/disable flags */
9657 /* Reset CLAT46 flag */
9658 if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);
9659
9660 /*
9661 * We do not reset the TCP keep alive counters in case
9662 * a TCP connection stays connection after the interface
9663 * went down
9664 */
9665 if (ifp->if_tcp_kao_cnt > 0) {
9666 os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
9667 __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
9668 }
9669 ifp->if_tcp_kao_max = 0;
9670
9671 /*
9672 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
9673 * no longer be visible during lookups from this point.
9674 */
9675 VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
9676 TAILQ_REMOVE(&ifnet_head, ifp, if_link);
9677 ifp->if_link.tqe_next = NULL;
9678 ifp->if_link.tqe_prev = NULL;
9679 if (ifp->if_ordered_link.tqe_next != NULL ||
9680 ifp->if_ordered_link.tqe_prev != NULL) {
9681 ifnet_remove_from_ordered_list(ifp);
9682 }
9683 ifindex2ifnet[ifp->if_index] = NULL;
9684
9685 /* 18717626 - reset router mode */
9686 if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
9687 ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;
9688
9689 /* Record detach PC stacktrace */
9690 ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);
9691
9692 /* Clear logging parameters */
9693 bzero(&ifp->if_log, sizeof(ifp->if_log));
9694
9695 /* Clear delegated interface info (reference released below) */
9696 delegated_ifp = ifp->if_delegated.ifp;
9697 bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));
9698
9699 /* Reset interface state */
9700 bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
9701
9702 /*
9703 * Increment the generation count on interface deletion
9704 */
9705 ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);
9706
9707 ifnet_lock_done(ifp);
9708 ifnet_head_done();
9709
9710 /* Release reference held on the delegated interface */
9711 if (delegated_ifp != NULL) {
9712 ifnet_release(delegated_ifp);
9713 }
9714
9715 /* Reset Link Quality Metric (unless loopback [lo0]) */
9716 if (ifp != lo_ifp) {
9717 if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
9718 }
9719
9720 /* Reset TCP local statistics */
9721 if (ifp->if_tcp_stat != NULL) {
9722 bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
9723 }
9724
9725 /* Reset UDP local statistics */
9726 if (ifp->if_udp_stat != NULL) {
9727 bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
9728 }
9729
9730 /* Reset ifnet IPv4 stats */
9731 if (ifp->if_ipv4_stat != NULL) {
9732 bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
9733 }
9734
9735 /* Reset ifnet IPv6 stats */
9736 if (ifp->if_ipv6_stat != NULL) {
9737 bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
9738 }
9739
9740 /* Release memory held for interface link status report */
9741 if (ifp->if_link_status != NULL) {
9742 kfree_type(struct if_link_status, ifp->if_link_status);
9743 ifp->if_link_status = NULL;
9744 }
9745
9746 /* Disable forwarding cached route */
9747 lck_mtx_lock(&ifp->if_cached_route_lock);
9748 ifp->if_fwd_cacheok = 0;
9749 lck_mtx_unlock(&ifp->if_cached_route_lock);
9750
9751 /* Disable data threshold and wait for any pending event posting */
9752 ifp->if_data_threshold = 0;
9753 VERIFY(ifp->if_dt_tcall != NULL);
9754 (void) thread_call_cancel_wait(ifp->if_dt_tcall);
9755
9756 /*
9757 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
9758 * references to the info structures and leave them attached to
9759 * this ifnet.
9760 */
9761 #if INET
9762 igmp_domifdetach(ifp);
9763 #endif /* INET */
9764 mld_domifdetach(ifp);
9765
9766 #if SKYWALK
9767 /* Clean up any netns tokens still pointing to to this ifnet */
9768 netns_ifnet_detach(ifp);
9769 #endif /* SKYWALK */
9770 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);
9771
9772 /* Let worker thread take care of the rest, to avoid reentrancy */
9773 dlil_if_lock();
9774 ifnet_detaching_enqueue(ifp);
9775 dlil_if_unlock();
9776
9777 return 0;
9778 }
9779
9780 static void
ifnet_detaching_enqueue(struct ifnet * ifp)9781 ifnet_detaching_enqueue(struct ifnet *ifp)
9782 {
9783 dlil_if_lock_assert();
9784
9785 ++ifnet_detaching_cnt;
9786 VERIFY(ifnet_detaching_cnt != 0);
9787 TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
9788 wakeup((caddr_t)&ifnet_delayed_run);
9789 }
9790
9791 static struct ifnet *
ifnet_detaching_dequeue(void)9792 ifnet_detaching_dequeue(void)
9793 {
9794 struct ifnet *ifp;
9795
9796 dlil_if_lock_assert();
9797
9798 ifp = TAILQ_FIRST(&ifnet_detaching_head);
9799 VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9800 if (ifp != NULL) {
9801 VERIFY(ifnet_detaching_cnt != 0);
9802 --ifnet_detaching_cnt;
9803 TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9804 ifp->if_detaching_link.tqe_next = NULL;
9805 ifp->if_detaching_link.tqe_prev = NULL;
9806 }
9807 return ifp;
9808 }
9809
/*
 * Continuation routine for the interface detacher thread.  Drains the
 * ifnet_detaching_head queue, invoking ifnet_detach_final() on each
 * dequeued interface, then blocks on ifnet_delayed_run with itself as
 * the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		/* first wakeup after thread creation; leave embryonic state */
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constrain so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock: ifnet_detach_final() may block */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* queue drained; sleep until the next enqueue wakes us */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9852
/*
 * Entry point of the interface detacher thread.  Arms the wait on
 * ifnet_delayed_run, marks itself embryonic, and issues a self-wakeup
 * so the continuation runs once to finish thread startup accounting.
 * Never returns; all further work happens in ifnet_detacher_thread_cont().
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9869
9870 static void
ifnet_detach_final(struct ifnet * ifp)9871 ifnet_detach_final(struct ifnet *ifp)
9872 {
9873 struct ifnet_filter *filter, *filter_next;
9874 struct dlil_ifnet *dlifp;
9875 struct ifnet_filter_head fhead;
9876 struct dlil_threading_info *inp;
9877 struct ifaddr *ifa;
9878 ifnet_detached_func if_free;
9879 int i;
9880
9881 /* Let BPF know we're detaching */
9882 bpfdetach(ifp);
9883
9884 #if SKYWALK
9885 dlil_netif_detach_notify(ifp);
9886 /*
9887 * Wait for the datapath to quiesce before tearing down
9888 * netif/flowswitch nexuses.
9889 */
9890 dlil_quiesce_and_detach_nexuses(ifp);
9891 #endif /* SKYWALK */
9892
9893 lck_mtx_lock(&ifp->if_ref_lock);
9894 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9895 panic("%s: flags mismatch (detaching not set) ifp=%p",
9896 __func__, ifp);
9897 /* NOTREACHED */
9898 }
9899
9900 /*
9901 * Wait until the existing IO references get released
9902 * before we proceed with ifnet_detach. This is not a
9903 * common case, so block without using a continuation.
9904 */
9905 while (ifp->if_refio > 0) {
9906 DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9907 "to be released\n", __func__, if_name(ifp));
9908 (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9909 (PZERO - 1), "ifnet_ioref_wait", NULL);
9910 }
9911
9912 VERIFY(ifp->if_datamov == 0);
9913 VERIFY(ifp->if_drainers == 0);
9914 VERIFY(ifp->if_suspend == 0);
9915 ifp->if_refflags &= ~IFRF_READY;
9916 lck_mtx_unlock(&ifp->if_ref_lock);
9917
9918 /* Clear agent IDs */
9919 if (ifp->if_agentids != NULL) {
9920 kfree_data(ifp->if_agentids,
9921 sizeof(uuid_t) * ifp->if_agentcount);
9922 ifp->if_agentids = NULL;
9923 }
9924 ifp->if_agentcount = 0;
9925
9926 #if SKYWALK
9927 VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9928 #endif /* SKYWALK */
9929 /* Drain and destroy send queue */
9930 ifclassq_teardown(ifp->if_snd);
9931
9932 /* Detach interface filters */
9933 lck_mtx_lock(&ifp->if_flt_lock);
9934 if_flt_monitor_enter(ifp);
9935
9936 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9937 fhead = ifp->if_flt_head;
9938 TAILQ_INIT(&ifp->if_flt_head);
9939
9940 for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9941 filter_next = TAILQ_NEXT(filter, filt_next);
9942 lck_mtx_unlock(&ifp->if_flt_lock);
9943
9944 dlil_detach_filter_internal(filter, 1);
9945 lck_mtx_lock(&ifp->if_flt_lock);
9946 }
9947 if_flt_monitor_leave(ifp);
9948 lck_mtx_unlock(&ifp->if_flt_lock);
9949
9950 /* Tell upper layers to drop their network addresses */
9951 if_purgeaddrs(ifp);
9952
9953 ifnet_lock_exclusive(ifp);
9954
9955 /* Unplumb all protocols */
9956 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9957 struct if_proto *proto;
9958
9959 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9960 while (proto != NULL) {
9961 protocol_family_t family = proto->protocol_family;
9962 ifnet_lock_done(ifp);
9963 proto_unplumb(family, ifp);
9964 ifnet_lock_exclusive(ifp);
9965 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9966 }
9967 /* There should not be any protocols left */
9968 VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9969 }
9970 kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9971 ifp->if_proto_hash = NULL;
9972
9973 /* Detach (permanent) link address from if_addrhead */
9974 ifa = TAILQ_FIRST(&ifp->if_addrhead);
9975 VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9976 IFA_LOCK(ifa);
9977 if_detach_link_ifa(ifp, ifa);
9978 IFA_UNLOCK(ifa);
9979
9980 /* Remove (permanent) link address from ifnet_addrs[] */
9981 IFA_REMREF(ifa);
9982 ifnet_addrs[ifp->if_index - 1] = NULL;
9983
9984 /* This interface should not be on {ifnet_head,detaching} */
9985 VERIFY(ifp->if_link.tqe_next == NULL);
9986 VERIFY(ifp->if_link.tqe_prev == NULL);
9987 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9988 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9989 VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9990 VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9991
9992 /* The slot should have been emptied */
9993 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9994
9995 /* There should not be any addresses left */
9996 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9997
9998 /*
9999 * Signal the starter thread to terminate itself, and wait until
10000 * it has exited.
10001 */
10002 if (ifp->if_start_thread != THREAD_NULL) {
10003 lck_mtx_lock_spin(&ifp->if_start_lock);
10004 ifp->if_start_flags |= IFSF_TERMINATING;
10005 wakeup_one((caddr_t)&ifp->if_start_thread);
10006 lck_mtx_unlock(&ifp->if_start_lock);
10007
10008 /* wait for starter thread to terminate */
10009 lck_mtx_lock(&ifp->if_start_lock);
10010 while (ifp->if_start_thread != THREAD_NULL) {
10011 if (dlil_verbose) {
10012 DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
10013 __func__,
10014 if_name(ifp));
10015 }
10016 (void) msleep(&ifp->if_start_thread,
10017 &ifp->if_start_lock, (PZERO - 1),
10018 "ifnet_start_thread_exit", NULL);
10019 }
10020 lck_mtx_unlock(&ifp->if_start_lock);
10021 if (dlil_verbose) {
10022 DLIL_PRINTF("%s: %s starter thread termination complete",
10023 __func__, if_name(ifp));
10024 }
10025 }
10026
10027 /*
10028 * Signal the poller thread to terminate itself, and wait until
10029 * it has exited.
10030 */
10031 if (ifp->if_poll_thread != THREAD_NULL) {
10032 #if SKYWALK
10033 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
10034 #endif /* SKYWALK */
10035 lck_mtx_lock_spin(&ifp->if_poll_lock);
10036 ifp->if_poll_flags |= IF_POLLF_TERMINATING;
10037 wakeup_one((caddr_t)&ifp->if_poll_thread);
10038 lck_mtx_unlock(&ifp->if_poll_lock);
10039
10040 /* wait for poller thread to terminate */
10041 lck_mtx_lock(&ifp->if_poll_lock);
10042 while (ifp->if_poll_thread != THREAD_NULL) {
10043 if (dlil_verbose) {
10044 DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
10045 __func__,
10046 if_name(ifp));
10047 }
10048 (void) msleep(&ifp->if_poll_thread,
10049 &ifp->if_poll_lock, (PZERO - 1),
10050 "ifnet_poll_thread_exit", NULL);
10051 }
10052 lck_mtx_unlock(&ifp->if_poll_lock);
10053 if (dlil_verbose) {
10054 DLIL_PRINTF("%s: %s poller thread termination complete\n",
10055 __func__, if_name(ifp));
10056 }
10057 }
10058
10059 /*
10060 * If thread affinity was set for the workloop thread, we will need
10061 * to tear down the affinity and release the extra reference count
10062 * taken at attach time. Does not apply to lo0 or other interfaces
10063 * without dedicated input threads.
10064 */
10065 if ((inp = ifp->if_inp) != NULL) {
10066 VERIFY(inp != dlil_main_input_thread);
10067
10068 if (inp->dlth_affinity) {
10069 struct thread *tp, *wtp, *ptp;
10070
10071 lck_mtx_lock_spin(&inp->dlth_lock);
10072 wtp = inp->dlth_driver_thread;
10073 inp->dlth_driver_thread = THREAD_NULL;
10074 ptp = inp->dlth_poller_thread;
10075 inp->dlth_poller_thread = THREAD_NULL;
10076 ASSERT(inp->dlth_thread != THREAD_NULL);
10077 tp = inp->dlth_thread; /* don't nullify now */
10078 inp->dlth_affinity_tag = 0;
10079 inp->dlth_affinity = FALSE;
10080 lck_mtx_unlock(&inp->dlth_lock);
10081
10082 /* Tear down poll thread affinity */
10083 if (ptp != NULL) {
10084 VERIFY(ifp->if_eflags & IFEF_RXPOLL);
10085 VERIFY(ifp->if_xflags & IFXF_LEGACY);
10086 (void) dlil_affinity_set(ptp,
10087 THREAD_AFFINITY_TAG_NULL);
10088 thread_deallocate(ptp);
10089 }
10090
10091 /* Tear down workloop thread affinity */
10092 if (wtp != NULL) {
10093 (void) dlil_affinity_set(wtp,
10094 THREAD_AFFINITY_TAG_NULL);
10095 thread_deallocate(wtp);
10096 }
10097
10098 /* Tear down DLIL input thread affinity */
10099 (void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
10100 thread_deallocate(tp);
10101 }
10102
10103 /* disassociate ifp DLIL input thread */
10104 ifp->if_inp = NULL;
10105
10106 /* if the worker thread was created, tell it to terminate */
10107 if (inp->dlth_thread != THREAD_NULL) {
10108 lck_mtx_lock_spin(&inp->dlth_lock);
10109 inp->dlth_flags |= DLIL_INPUT_TERMINATE;
10110 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
10111 wakeup_one((caddr_t)&inp->dlth_flags);
10112 }
10113 lck_mtx_unlock(&inp->dlth_lock);
10114 ifnet_lock_done(ifp);
10115
10116 /* wait for the input thread to terminate */
10117 lck_mtx_lock_spin(&inp->dlth_lock);
10118 while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
10119 == 0) {
10120 (void) msleep(&inp->dlth_flags, &inp->dlth_lock,
10121 (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
10122 }
10123 lck_mtx_unlock(&inp->dlth_lock);
10124 ifnet_lock_exclusive(ifp);
10125 }
10126
10127 /* clean-up input thread state */
10128 dlil_clean_threading_info(inp);
10129 /* clean-up poll parameters */
10130 VERIFY(ifp->if_poll_thread == THREAD_NULL);
10131 dlil_reset_rxpoll_params(ifp);
10132 }
10133
10134 /* The driver might unload, so point these to ourselves */
10135 if_free = ifp->if_free;
10136 ifp->if_output_dlil = ifp_if_output;
10137 ifp->if_output = ifp_if_output;
10138 ifp->if_pre_enqueue = ifp_if_output;
10139 ifp->if_start = ifp_if_start;
10140 ifp->if_output_ctl = ifp_if_ctl;
10141 ifp->if_input_dlil = ifp_if_input;
10142 ifp->if_input_poll = ifp_if_input_poll;
10143 ifp->if_input_ctl = ifp_if_ctl;
10144 ifp->if_ioctl = ifp_if_ioctl;
10145 ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
10146 ifp->if_free = ifp_if_free;
10147 ifp->if_demux = ifp_if_demux;
10148 ifp->if_event = ifp_if_event;
10149 ifp->if_framer_legacy = ifp_if_framer;
10150 ifp->if_framer = ifp_if_framer_extended;
10151 ifp->if_add_proto = ifp_if_add_proto;
10152 ifp->if_del_proto = ifp_if_del_proto;
10153 ifp->if_check_multi = ifp_if_check_multi;
10154
10155 /* wipe out interface description */
10156 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
10157 ifp->if_desc.ifd_len = 0;
10158 VERIFY(ifp->if_desc.ifd_desc != NULL);
10159 bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
10160
10161 /* there shouldn't be any delegation by now */
10162 VERIFY(ifp->if_delegated.ifp == NULL);
10163 VERIFY(ifp->if_delegated.type == 0);
10164 VERIFY(ifp->if_delegated.family == 0);
10165 VERIFY(ifp->if_delegated.subfamily == 0);
10166 VERIFY(ifp->if_delegated.expensive == 0);
10167 VERIFY(ifp->if_delegated.constrained == 0);
10168
10169 /* QoS marking get cleared */
10170 if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
10171 if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
10172
10173 #if SKYWALK
10174 /* the nexus destructor is responsible for clearing these */
10175 VERIFY(ifp->if_na_ops == NULL);
10176 VERIFY(ifp->if_na == NULL);
10177 #endif /* SKYWALK */
10178
10179 /* promiscuous/allmulti counts need to start at zero again */
10180 ifp->if_pcount = 0;
10181 ifp->if_amcount = 0;
10182 ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
10183
10184 ifnet_lock_done(ifp);
10185
10186 #if PF
10187 /*
10188 * Detach this interface from packet filter, if enabled.
10189 */
10190 pf_ifnet_hook(ifp, 0);
10191 #endif /* PF */
10192
10193 /* Filter list should be empty */
10194 lck_mtx_lock_spin(&ifp->if_flt_lock);
10195 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
10196 VERIFY(ifp->if_flt_busy == 0);
10197 VERIFY(ifp->if_flt_waiters == 0);
10198 VERIFY(ifp->if_flt_non_os_count == 0);
10199 VERIFY(ifp->if_flt_no_tso_count == 0);
10200 lck_mtx_unlock(&ifp->if_flt_lock);
10201
10202 /* Last chance to drain send queue */
10203 if_qflush_snd(ifp, 0);
10204
10205 /* Last chance to cleanup any cached route */
10206 lck_mtx_lock(&ifp->if_cached_route_lock);
10207 VERIFY(!ifp->if_fwd_cacheok);
10208 ROUTE_RELEASE(&ifp->if_fwd_route);
10209 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
10210 ROUTE_RELEASE(&ifp->if_src_route);
10211 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
10212 ROUTE_RELEASE(&ifp->if_src_route6);
10213 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
10214 lck_mtx_unlock(&ifp->if_cached_route_lock);
10215
10216 VERIFY(ifp->if_data_threshold == 0);
10217 VERIFY(ifp->if_dt_tcall != NULL);
10218 VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
10219
10220 ifnet_llreach_ifdetach(ifp);
10221
10222 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
10223
10224 /*
10225 * Finally, mark this ifnet as detached.
10226 */
10227 if (dlil_verbose) {
10228 DLIL_PRINTF("%s: detached\n", if_name(ifp));
10229 }
10230 lck_mtx_lock_spin(&ifp->if_ref_lock);
10231 if (!(ifp->if_refflags & IFRF_DETACHING)) {
10232 panic("%s: flags mismatch (detaching not set) ifp=%p",
10233 __func__, ifp);
10234 /* NOTREACHED */
10235 }
10236 ifp->if_refflags &= ~IFRF_DETACHING;
10237 lck_mtx_unlock(&ifp->if_ref_lock);
10238 if (if_free != NULL) {
10239 if_free(ifp);
10240 }
10241
10242 ifclassq_release(&ifp->if_snd);
10243
10244 /* we're fully detached, clear the "in use" bit */
10245 dlifp = (struct dlil_ifnet *)ifp;
10246 lck_mtx_lock(&dlifp->dl_if_lock);
10247 ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
10248 dlifp->dl_if_flags &= ~DLIF_INUSE;
10249 lck_mtx_unlock(&dlifp->dl_if_lock);
10250
10251 /* Release reference held during ifnet attach */
10252 ifnet_release(ifp);
10253 }
10254
10255 errno_t
ifp_if_output(struct ifnet * ifp,struct mbuf * m)10256 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
10257 {
10258 #pragma unused(ifp)
10259 m_freem_list(m);
10260 return 0;
10261 }
10262
/*
 * Start handler installed on a detached ifnet; purges anything still
 * sitting in the interface send queue instead of transmitting it.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
10268
10269 static errno_t
ifp_if_input(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)10270 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
10271 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
10272 boolean_t poll, struct thread *tp)
10273 {
10274 #pragma unused(ifp, m_tail, s, poll, tp)
10275 m_freem_list(m_head);
10276 return ENXIO;
10277 }
10278
/*
 * Poll handler installed on a detached ifnet: hand back an empty
 * packet chain and zero counters, since nothing can be polled off a
 * detached interface.  All out-parameters are optional.
 */
static void
ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
    struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
{
#pragma unused(ifp, flags, max_cnt)
	if (m_head != NULL) {
		*m_head = NULL;
	}
	if (m_tail != NULL) {
		*m_tail = NULL;
	}
	if (cnt != NULL) {
		*cnt = 0;
	}
	if (len != NULL) {
		*len = 0;
	}
}
10297
/*
 * Control handler installed on a detached ifnet; no operation is
 * supported once the driver is gone.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
10304
/*
 * Demux handler installed on a detached ifnet: free the packet and
 * return EJUSTRETURN so the caller does not process it further.
 */
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	m_freem(m);
	return EJUSTRETURN;
}
10312
/*
 * add_proto handler installed on a detached ifnet; attaching a
 * protocol to a detached interface is invalid.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
10320
/*
 * del_proto handler installed on a detached ifnet; by this point all
 * protocols have been unplumbed, so there is nothing to remove.
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
10327
/*
 * check_multi handler installed on a detached ifnet; multicast
 * membership checks are not supported once the driver is gone.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
10334
/*
 * Legacy framer installed on a detached ifnet.  The signature differs
 * by platform (embedded targets carry pre/post padding out-parameters);
 * both variants simply forward to ifp_if_framer_extended(), which
 * frees the packet.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10353
10354 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10355 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10356 const struct sockaddr *sa, const char *ll, const char *t,
10357 u_int32_t *pre, u_int32_t *post)
10358 {
10359 #pragma unused(ifp, sa, ll, t)
10360 m_freem(*m);
10361 *m = NULL;
10362
10363 if (pre != NULL) {
10364 *pre = 0;
10365 }
10366 if (post != NULL) {
10367 *post = 0;
10368 }
10369
10370 return EJUSTRETURN;
10371 }
10372
/*
 * ioctl handler installed on a detached ifnet; all ioctls fail once
 * the driver is gone.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
10379
/*
 * BPF tap handler installed on a detached ifnet; accept the request
 * as a no-op since there is no datapath left to tap.
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
10387
/*
 * if_free handler installed on a detached ifnet; deliberately empty —
 * the real driver free routine was captured and invoked during
 * final detach.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10393
/*
 * Event handler installed on a detached ifnet; events are silently
 * ignored once the driver is gone.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10399
/*
 * Find or allocate the dlil_ifnet storage backing an interface for the
 * given (family, uniqueid, ifxname) tuple.  On success, *ifp points at
 * an interface marked DLIF_INUSE — either a recycled one whose unique
 * id matched (DLIF_REUSE also set), or a freshly zeroed allocation —
 * with a dlil reference held on it.  Returns EBUSY when an in-use
 * interface of the same family already owns the extended name or the
 * unique id, ENOMEM when the unique-id copy cannot be allocated.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		/* re-check under the per-if lock: flags may have changed */
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			/* free the zone element, not the aligned base */
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* point name storage at the embedded buffers */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;
	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/*
	 * dlifp1 is NULL here on the recycle/EBUSY paths (TAILQ_FOREACH
	 * fell off the end), so the VERIFY only checks fresh allocations.
	 */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10578
/*
 * Common teardown for releasing a dlil-managed ifnet back to the
 * available pool: drops alloc statistics, frees any oversized
 * broadcast address, resets the name/xname to the embedded storage
 * (xname gets a "name?" placeholder), and optionally clears the
 * DLIF_INUSE bit when the caller owns it.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* free heap-allocated broadcast address, if it outgrew the inline buffer */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10609
/*
 * Public wrapper around _dlil_if_release() that leaves the DLIF_INUSE
 * bit alone; final detach clears it separately.
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10615
/* Acquire the global dlil interface-list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10621
/* Release the global dlil interface-list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10627
/* Assert that the current thread owns the dlil interface-list mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10633
/*
 * Unplumb every protocol family that requires an explicit detach from
 * the given interface (PF_INET and PF_INET6); errors are ignored.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10649
/*
 * Copy the interface's cached IPv4 source route into *dst, taking its
 * own reference.  The lock is taken spinning (expected short hold)
 * then converted to a full mutex since route_copyout() may block.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10660
/*
 * Store *src as the interface's cached IPv4 source route.  If forwarding
 * route caching is disabled (if_fwd_cacheok == 0), the route reference
 * is released instead of being cached.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10674
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the cached
 * route_in6 into *dst under the cached-route lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10686
/*
 * IPv6 counterpart of ifp_src_route_copyin(): cache *src as the
 * interface's IPv6 source route, or release it when route caching
 * is disabled for the interface.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10701
/*
 * Return a referenced rtentry for IPv4 source address 'src_ip', scoped
 * to 'ifp', using (and refreshing) the interface's cached source route.
 * May return NULL when no route exists.  The caller owns the returned
 * reference.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route src_rt;
	struct sockaddr_in *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	/* Grab a private copy of the interface's cached route. */
	ifp_src_route_copyout(ifp, &src_rt);

	/* Re-resolve if the cache is unusable or is for a different address. */
	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			/* (Re)initialize the destination sockaddr. */
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		/* Scoped lookup restricted to this interface's index. */
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10736
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): return a referenced
 * rtentry for *src_ip6 scoped to 'ifp', refreshing the interface's
 * cached IPv6 source route as needed.  May return NULL.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			/* (Re)initialize the destination sockaddr. */
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/*
		 * ro_rt is NULL after the ROUTE_RELEASE above, so this
		 * lookup always runs (the IPv4 variant VERIFYs the same
		 * condition instead of testing it).
		 */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10773
/*
 * Update the interface's link quality metric (LQM) state.
 *
 * The raw 'lqm' value is normalized to a threshold edge (ABORT /
 * MINIMALLY_VIABLE / POOR / GOOD).  If the normalized value differs from
 * the currently recorded state, it is stored and a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED event is posted with the ifnet lock
 * dropped.  'locked' indicates whether the caller already holds the
 * ifnet lock exclusively; the lock is held on return iff it was held on
 * entry.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* Quality dropped to "abort": kick the TCP timers promptly. */
		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return; /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10837
/*
 * Record a new cellular RRC state and post KEV_DL_RRC_STATE_CHANGED.
 *
 * NOTE: called with the ifnet lock held exclusively (see
 * if_state_update); the lock is dropped while the kernel event is
 * posted and reacquired before returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* No-op when the state is already recorded and valid. */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10867
/*
 * Validate and apply an externally supplied interface state change
 * (link quality, cellular RRC state, availability), e.g. via
 * SIOCSIFINTERFACESTATE.
 *
 * Returns ENOTSUP when an RRC state is supplied for a non-cellular
 * interface, EINVAL for out-of-range LQM or RRC values, 0 otherwise.
 * When the interface becomes available, TCP connections on it are
 * prodded to send probe packets immediately.
 */
errno_t
if_state_update(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	u_short if_index_available = 0;

	ifnet_lock_exclusive(ifp);

	/* RRC state only makes sense for cellular interfaces. */
	if ((ifp->if_type != IFT_CELLULAR) &&
	    (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		ifnet_lock_done(ifp);
		return ENOTSUP;
	}
	/* Range-check the supplied LQM value. */
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}
	/* Only IDLE and CONNECTED are acceptable RRC states. */
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}

	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		/* '1' tells if_lqm_update the ifnet lock is already held. */
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		ifp->if_interface_state.interface_availability =
		    if_interface_state->interface_availability;

		if (ifp->if_interface_state.interface_availability ==
		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
			    __func__, if_name(ifp), ifp->if_index);
			if_index_available = ifp->if_index;
		} else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
			    __func__, if_name(ifp), ifp->if_index);
		}
	}
	ifnet_lock_done(ifp);

	/*
	 * Check if the TCP connections going on this interface should be
	 * forced to send probe packets instead of waiting for TCP timers
	 * to fire. This is done on an explicit notification such as
	 * SIOCSIFINTERFACESTATE which marks the interface as available.
	 */
	if (if_index_available > 0) {
		tcp_interface_send_probe(if_index_available);
	}

	return 0;
}
10938
/*
 * Snapshot the interface's currently-valid state fields (RRC state,
 * LQM, availability) into the caller-supplied structure under the
 * shared ifnet lock.  Fields whose valid bit is not set are left
 * untouched and their bit is clear in the output bitmask.
 */
void
if_get_state(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	ifnet_lock_shared(ifp);

	if_interface_state->valid_bitmask = 0;

	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_RRC_STATE_VALID;
		if_interface_state->rrc_state =
		    ifp->if_interface_state.rrc_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
		if_interface_state->lqm_state =
		    ifp->if_interface_state.lqm_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		if_interface_state->interface_availability =
		    ifp->if_interface_state.interface_availability;
	}

	ifnet_lock_done(ifp);
}
10971
10972 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10973 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10974 {
10975 if (conn_probe > 1) {
10976 return EINVAL;
10977 }
10978 if (conn_probe == 0) {
10979 if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10980 } else {
10981 if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10982 }
10983
10984 #if NECP
10985 necp_update_all_clients();
10986 #endif /* NECP */
10987
10988 tcp_probe_connectivity(ifp, conn_probe);
10989 return 0;
10990 }
10991
10992 /* for uuid.c */
/*
 * Scan the interface list for the best Ethernet interface for UUID
 * generation: en0 if it exists, otherwise the lowest-unit en*, otherwise
 * any IFT_ETHER interface.  Returns en0's index (0 if absent); the
 * fallback index, if any, is stored in *ret_other_index.
 * Caller holds the ifnet head lock (see uuid_get_ethernet).
 */
static int
get_ether_index(int * ret_other_index)
{
	struct ifnet *ifp;
	int en0_index = 0;
	int other_en_index = 0;
	int any_ether_index = 0;
	short best_unit = 0;

	*ret_other_index = 0;
	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
		/*
		 * find en0, or if not en0, the lowest unit en*, and if not
		 * that, any ethernet
		 */
		ifnet_lock_shared(ifp);
		if (strcmp(ifp->if_name, "en") == 0) {
			if (ifp->if_unit == 0) {
				/* found en0, we're done */
				en0_index = ifp->if_index;
				ifnet_lock_done(ifp);
				break;
			}
			if (other_en_index == 0 || ifp->if_unit < best_unit) {
				other_en_index = ifp->if_index;
				best_unit = ifp->if_unit;
			}
		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
			any_ether_index = ifp->if_index;
		}
		ifnet_lock_done(ifp);
	}
	if (en0_index == 0) {
		/* No en0: fall back to the best en* or any Ethernet. */
		if (other_en_index != 0) {
			*ret_other_index = other_en_index;
		} else if (any_ether_index != 0) {
			*ret_other_index = any_ether_index;
		}
	}
	return en0_index;
}
11034
/*
 * Copy a 6-byte Ethernet address into 'node' for UUID generation
 * (see uuid.c).  Prefers en0 (caching its index across calls), then a
 * fallback interface chosen by get_ether_index().  Uses the permanent
 * (factory) MAC when the driver supplied one, since it never changes.
 * Returns 0 on success, -1 when no Ethernet interface is found.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;   /* cached en0 ifindex; revalidated below */
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* Revalidate the cache: the index may have been detached. */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
11076
11077 static int
11078 sysctl_rxpoll SYSCTL_HANDLER_ARGS
11079 {
11080 #pragma unused(arg1, arg2)
11081 uint32_t i;
11082 int err;
11083
11084 i = if_rxpoll;
11085
11086 err = sysctl_handle_int(oidp, &i, 0, req);
11087 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11088 return err;
11089 }
11090
11091 if (net_rxpoll == 0) {
11092 return ENXIO;
11093 }
11094
11095 if_rxpoll = i;
11096 return err;
11097 }
11098
11099 static int
11100 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
11101 {
11102 #pragma unused(arg1, arg2)
11103 uint64_t q;
11104 int err;
11105
11106 q = if_rxpoll_mode_holdtime;
11107
11108 err = sysctl_handle_quad(oidp, &q, 0, req);
11109 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11110 return err;
11111 }
11112
11113 if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
11114 q = IF_RXPOLL_MODE_HOLDTIME_MIN;
11115 }
11116
11117 if_rxpoll_mode_holdtime = q;
11118
11119 return err;
11120 }
11121
11122 static int
11123 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
11124 {
11125 #pragma unused(arg1, arg2)
11126 uint64_t q;
11127 int err;
11128
11129 q = if_rxpoll_sample_holdtime;
11130
11131 err = sysctl_handle_quad(oidp, &q, 0, req);
11132 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11133 return err;
11134 }
11135
11136 if (q < IF_RXPOLL_SAMPLETIME_MIN) {
11137 q = IF_RXPOLL_SAMPLETIME_MIN;
11138 }
11139
11140 if_rxpoll_sample_holdtime = q;
11141
11142 return err;
11143 }
11144
11145 static int
11146 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
11147 {
11148 #pragma unused(arg1, arg2)
11149 uint64_t q;
11150 int err;
11151
11152 q = if_rxpoll_interval_time;
11153
11154 err = sysctl_handle_quad(oidp, &q, 0, req);
11155 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11156 return err;
11157 }
11158
11159 if (q < IF_RXPOLL_INTERVALTIME_MIN) {
11160 q = IF_RXPOLL_INTERVALTIME_MIN;
11161 }
11162
11163 if_rxpoll_interval_time = q;
11164
11165 return err;
11166 }
11167
11168 static int
11169 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
11170 {
11171 #pragma unused(arg1, arg2)
11172 uint32_t i;
11173 int err;
11174
11175 i = if_sysctl_rxpoll_wlowat;
11176
11177 err = sysctl_handle_int(oidp, &i, 0, req);
11178 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11179 return err;
11180 }
11181
11182 if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
11183 return EINVAL;
11184 }
11185
11186 if_sysctl_rxpoll_wlowat = i;
11187 return err;
11188 }
11189
11190 static int
11191 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
11192 {
11193 #pragma unused(arg1, arg2)
11194 uint32_t i;
11195 int err;
11196
11197 i = if_sysctl_rxpoll_whiwat;
11198
11199 err = sysctl_handle_int(oidp, &i, 0, req);
11200 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11201 return err;
11202 }
11203
11204 if (i <= if_sysctl_rxpoll_wlowat) {
11205 return EINVAL;
11206 }
11207
11208 if_sysctl_rxpoll_whiwat = i;
11209 return err;
11210 }
11211
11212 static int
11213 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
11214 {
11215 #pragma unused(arg1, arg2)
11216 int i, err;
11217
11218 i = if_sndq_maxlen;
11219
11220 err = sysctl_handle_int(oidp, &i, 0, req);
11221 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11222 return err;
11223 }
11224
11225 if (i < IF_SNDQ_MINLEN) {
11226 i = IF_SNDQ_MINLEN;
11227 }
11228
11229 if_sndq_maxlen = i;
11230 return err;
11231 }
11232
11233 static int
11234 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
11235 {
11236 #pragma unused(arg1, arg2)
11237 int i, err;
11238
11239 i = if_rcvq_maxlen;
11240
11241 err = sysctl_handle_int(oidp, &i, 0, req);
11242 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11243 return err;
11244 }
11245
11246 if (i < IF_RCVQ_MINLEN) {
11247 i = IF_RCVQ_MINLEN;
11248 }
11249
11250 if_rcvq_maxlen = i;
11251 return err;
11252 }
11253
11254 static int
11255 sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS
11256 {
11257 #pragma unused(arg1, arg2)
11258 int i, err;
11259
11260 i = if_rcvq_burst_limit;
11261
11262 err = sysctl_handle_int(oidp, &i, 0, req);
11263 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11264 return err;
11265 }
11266
11267 /*
11268 * Safeguard the burst limit to "sane" values on customer builds.
11269 */
11270 #if !(DEVELOPMENT || DEBUG)
11271 if (i < IF_RCVQ_BURST_LIMIT_MIN) {
11272 i = IF_RCVQ_BURST_LIMIT_MIN;
11273 }
11274
11275 if (IF_RCVQ_BURST_LIMIT_MAX < i) {
11276 i = IF_RCVQ_BURST_LIMIT_MAX;
11277 }
11278 #endif
11279
11280 if_rcvq_burst_limit = i;
11281 return err;
11282 }
11283
11284 static int
11285 sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS
11286 {
11287 #pragma unused(arg1, arg2)
11288 int i, err;
11289
11290 i = if_rcvq_burst_limit;
11291
11292 err = sysctl_handle_int(oidp, &i, 0, req);
11293 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11294 return err;
11295 }
11296
11297 if (IF_RCVQ_TRIM_PCT_MAX < i) {
11298 i = IF_RCVQ_TRIM_PCT_MAX;
11299 }
11300
11301 if (i < IF_RCVQ_TRIM_PCT_MIN) {
11302 i = IF_RCVQ_TRIM_PCT_MIN;
11303 }
11304
11305 if_rcvq_trim_pct = i;
11306 return err;
11307 }
11308
11309 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])11310 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
11311 int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
11312 {
11313 struct kev_dl_node_presence kev;
11314 struct sockaddr_dl *sdl;
11315 struct sockaddr_in6 *sin6;
11316 int ret = 0;
11317
11318 VERIFY(ifp);
11319 VERIFY(sa);
11320 VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
11321
11322 bzero(&kev, sizeof(kev));
11323 sin6 = &kev.sin6_node_address;
11324 sdl = &kev.sdl_node_address;
11325 nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
11326 kev.rssi = rssi;
11327 kev.link_quality_metric = lqm;
11328 kev.node_proximity_metric = npm;
11329 bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
11330
11331 ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
11332 if (ret == 0 || ret == EEXIST) {
11333 int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
11334 &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
11335 if (err != 0) {
11336 log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
11337 "error %d\n", __func__, err);
11338 }
11339 }
11340
11341 if (ret == EEXIST) {
11342 ret = 0;
11343 }
11344 return ret;
11345 }
11346
/*
 * Report that a previously present node has left the link.  Removes the
 * neighbor from the ND6 cache and, if that succeeds, posts a
 * KEV_DL_NODE_ABSENCE event carrying both address forms.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* Stamp the link-layer address with this ifnet's identity. */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
11387
/*
 * Variant of dlil_node_present() where the caller supplies the IPv6
 * address ('sa') and the link-layer address ('sdl') separately instead
 * of a single address to be decomposed.  Same event and return-value
 * semantics: EEXIST from the ND6 layer is folded into success.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	/* Stamp the link-layer address with this ifnet's identity. */
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		/* EEXIST means an update of an already-known node. */
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11431
11432 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11433 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11434 kauth_cred_t *credp)
11435 {
11436 const u_int8_t *bytes;
11437 size_t size;
11438
11439 bytes = CONST_LLADDR(sdl);
11440 size = sdl->sdl_alen;
11441
11442 #if CONFIG_MACF
11443 if (dlil_lladdr_ckreq) {
11444 switch (sdl->sdl_type) {
11445 case IFT_ETHER:
11446 case IFT_IEEE1394:
11447 break;
11448 default:
11449 credp = NULL;
11450 break;
11451 }
11452 ;
11453
11454 if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11455 static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11456 [0] = 2
11457 };
11458
11459 bytes = unspec;
11460 }
11461 }
11462 #else
11463 #pragma unused(credp)
11464 #endif
11465
11466 if (sizep != NULL) {
11467 *sizep = size;
11468 }
11469 return bytes;
11470 }
11471
11472 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])11473 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11474 u_int8_t info[DLIL_MODARGLEN])
11475 {
11476 struct kev_dl_issues kev;
11477 struct timeval tv;
11478
11479 VERIFY(ifp != NULL);
11480 VERIFY(modid != NULL);
11481 _CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11482 _CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11483
11484 bzero(&kev, sizeof(kev));
11485
11486 microtime(&tv);
11487 kev.timestamp = tv.tv_sec;
11488 bcopy(modid, &kev.modid, DLIL_MODIDLEN);
11489 if (info != NULL) {
11490 bcopy(info, &kev.info, DLIL_MODARGLEN);
11491 }
11492
11493 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
11494 &kev.link_data, sizeof(kev), FALSE);
11495 }
11496
/*
 * Handle SIOCSIFOPPORTUNISTIC (set, root-only) and SIOCGIFOPPORTUNISTIC
 * (get) by mapping the ifreq opportunistic flags onto the interface
 * throttle level.  In both cases, on success the current count of
 * opportunistic TCP/UDP connections on the interface is returned in
 * ifr->ifr_opportunistic.ifo_inuse.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* Map the request flags onto a throttle level. */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		/* Get: report the current throttle level as a flag. */
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* EALREADY (already at the requested level) is not an error. */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11555
/*
 * Query the interface's current output throttle level.  Requires a
 * TXSTART (driver-managed output queue) interface; *level receives
 * IFNET_THROTTLE_OFF when the classq is not enabled.
 * Returns ENXIO for non-TXSTART interfaces.
 */
int
ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	*level = IFNET_THROTTLE_OFF;

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* Throttling works only for IFCQ, not ALTQ instances */
	if (IFCQ_IS_ENABLED(ifq)) {
		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
		*level = req.level;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
11581
/*
 * Set the interface's output throttle level (IFNET_THROTTLE_OFF or
 * IFNET_THROTTLE_OPPORTUNISTIC).  Requires a TXSTART interface
 * (ENXIO otherwise); other levels return EINVAL.  On success, NECP
 * clients are notified, and output is restarted when throttling is
 * turned off.
 */
int
ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;

	switch (level) {
	case IFNET_THROTTLE_OFF:
	case IFNET_THROTTLE_OPPORTUNISTIC:
		break;
	default:
		return EINVAL;
	}

	IFCQ_LOCK(ifq);
	if (IFCQ_IS_ENABLED(ifq)) {
		/* { set = 1, level } — apply the new throttle level. */
		cqrq_throttle_t req = { 1, level };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
	}
	IFCQ_UNLOCK(ifq);

	if (err == 0) {
		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
		    level);
#if NECP
		necp_update_all_clients();
#endif /* NECP */
		/* Un-throttling: kick the output queue back into motion. */
		if (level == IFNET_THROTTLE_OFF) {
			ifnet_start(ifp);
		}
	}

	return err;
}
11623
/*
 * Handle SIOCSIFLOG (set, requires PRIV_NET_INTERFACE_CONTROL) and
 * SIOCGIFLOG (get) by validating the ifreq log parameters and passing
 * them to ifnet_set_log()/ifnet_get_log().
 */
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* At least one valid facility flag must be set. */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11671
/*
 * Set the interface's logging level/flags/category.  Facility flags are
 * merged with the existing ones; facilities other than DLIL are
 * forwarded to the driver through its if_output_ctl callback when one
 * is registered, otherwise silently dropped.  Setting the level to
 * IFNET_LOG_DEFAULT clears all facility flags.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* DLIL's own facility is handled here, not by the driver. */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* IFNET_LOG_DEFAULT resets all facility flags. */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11730
11731 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11732 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11733 int32_t *category, int32_t *subcategory)
11734 {
11735 if (level != NULL) {
11736 *level = ifp->if_log.level;
11737 }
11738 if (flags != NULL) {
11739 *flags = ifp->if_log.flags;
11740 }
11741 if (category != NULL) {
11742 *category = ifp->if_log.category;
11743 }
11744 if (subcategory != NULL) {
11745 *subcategory = ifp->if_log.subcategory;
11746 }
11747
11748 return 0;
11749 }
11750
/*
 * Notify interested parties that an address of family 'af' changed on
 * the interface: run the PF hook (when compiled in) and forward an
 * IFNET_CTL_NOTIFY_ADDRESS request to the driver's output control
 * callback.  Returns EOPNOTSUPP when the driver has no callback.
 */
int
ifnet_notify_address(struct ifnet *ifp, int af)
{
	struct ifnet_notify_address_params na;

#if PF
	(void) pf_ifaddr_hook(ifp);
#endif /* PF */

	if (ifp->if_output_ctl == NULL) {
		return EOPNOTSUPP;
	}

	bzero(&na, sizeof(na));
	na.address_family = (sa_family_t)af;

	return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
	    sizeof(na), &na);
}
11770
11771 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11772 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11773 {
11774 if (ifp == NULL || flowid == NULL) {
11775 return EINVAL;
11776 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11777 !IF_FULLY_ATTACHED(ifp)) {
11778 return ENXIO;
11779 }
11780
11781 *flowid = ifp->if_flowhash;
11782
11783 return 0;
11784 }
11785
11786 errno_t
ifnet_disable_output(struct ifnet * ifp)11787 ifnet_disable_output(struct ifnet *ifp)
11788 {
11789 int err;
11790
11791 if (ifp == NULL) {
11792 return EINVAL;
11793 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11794 !IF_FULLY_ATTACHED(ifp)) {
11795 return ENXIO;
11796 }
11797
11798 if ((err = ifnet_fc_add(ifp)) == 0) {
11799 lck_mtx_lock_spin(&ifp->if_start_lock);
11800 ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11801 lck_mtx_unlock(&ifp->if_start_lock);
11802 }
11803 return err;
11804 }
11805
11806 errno_t
ifnet_enable_output(struct ifnet * ifp)11807 ifnet_enable_output(struct ifnet *ifp)
11808 {
11809 if (ifp == NULL) {
11810 return EINVAL;
11811 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11812 !IF_FULLY_ATTACHED(ifp)) {
11813 return ENXIO;
11814 }
11815
11816 ifnet_start_common(ifp, TRUE, FALSE);
11817 return 0;
11818 }
11819
11820 void
ifnet_flowadv(uint32_t flowhash)11821 ifnet_flowadv(uint32_t flowhash)
11822 {
11823 struct ifnet_fc_entry *ifce;
11824 struct ifnet *ifp;
11825
11826 ifce = ifnet_fc_get(flowhash);
11827 if (ifce == NULL) {
11828 return;
11829 }
11830
11831 VERIFY(ifce->ifce_ifp != NULL);
11832 ifp = ifce->ifce_ifp;
11833
11834 /* flow hash gets recalculated per attach, so check */
11835 if (ifnet_is_attached(ifp, 1)) {
11836 if (ifp->if_flowhash == flowhash) {
11837 (void) ifnet_enable_output(ifp);
11838 }
11839 ifnet_decr_iorefcnt(ifp);
11840 }
11841 ifnet_fc_entry_free(ifce);
11842 }
11843
11844 /*
11845 * Function to compare ifnet_fc_entries in ifnet flow control tree
11846 */
11847 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11848 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11849 {
11850 return fc1->ifce_flowhash - fc2->ifce_flowhash;
11851 }
11852
/*
 * Insert an entry for ifp, keyed by its flow hash, into the global
 * flow-control tree so a later flow advisory can locate the interface.
 * Returns 0 on success or if the entry already exists; EAGAIN when a
 * different interface already occupies the same flow hash (collision).
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* lookup key carries only the flow hash */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer.  There can be a collision
		 * on flow hash but the probability is low.  Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex: Z_WAITOK below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11896
/*
 * Look up and REMOVE the flow-control entry for the given flow hash.
 * Returns the detached entry (caller must free it via
 * ifnet_fc_entry_free), or NULL when no entry exists or the associated
 * interface is no longer attached.
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* lookup key carries only the flow hash */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* detach from the tree before the attach-state check below */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11934
/* Return a flow-control entry to its zone. */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11940
11941 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11942 ifnet_calc_flowhash(struct ifnet *ifp)
11943 {
11944 struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11945 uint32_t flowhash = 0;
11946
11947 if (ifnet_flowhash_seed == 0) {
11948 ifnet_flowhash_seed = RandomULong();
11949 }
11950
11951 bzero(&fh, sizeof(fh));
11952
11953 (void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11954 fh.ifk_unit = ifp->if_unit;
11955 fh.ifk_flags = ifp->if_flags;
11956 fh.ifk_eflags = ifp->if_eflags;
11957 fh.ifk_capabilities = ifp->if_capabilities;
11958 fh.ifk_capenable = ifp->if_capenable;
11959 fh.ifk_output_sched_model = ifp->if_output_sched_model;
11960 fh.ifk_rand1 = RandomULong();
11961 fh.ifk_rand2 = RandomULong();
11962
11963 try_again:
11964 flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11965 if (flowhash == 0) {
11966 /* try to get a non-zero flowhash */
11967 ifnet_flowhash_seed = RandomULong();
11968 goto try_again;
11969 }
11970
11971 return flowhash;
11972 }
11973
/*
 * Store (len > 0) or clear (len == 0) the per-address-family network
 * signature on the interface.  Returns EINVAL for an unsupported family
 * or a signature larger than the storage, ENOMEM when the per-family
 * extension area has not been allocated.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* note: unlocks here, then skips the unlock below */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				/* note: unlocks here, then skips the unlock below */
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
12035
/*
 * Copy the per-address-family network signature into 'data'.
 * On input *len is the caller's buffer size; on success it is updated
 * to the signature length.  Returns EINVAL on bad arguments/family or a
 * too-small buffer, ENOENT when no signature is set, ENOMEM when the
 * per-family extension area has not been allocated.  *flags is zeroed
 * on success when supplied.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* caller's buffer must hold the full signature */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* caller's buffer must hold the full signature */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
12096
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on the interface.
 * A prefix_len of 0 clears that slot.  Only the standard RFC 6052
 * prefix lengths are accepted, and scoped (link/interface-local)
 * prefixes are rejected.  NECP clients are notified when at least one
 * prefix was installed.  Returns EINVAL on a bad prefix, ENOMEM when
 * the inet6 extension area has not been allocated.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* notify NECP outside the lock */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
12162
12163 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)12164 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12165 {
12166 int i, found_one = 0, error = 0;
12167
12168 if (ifp == NULL) {
12169 return EINVAL;
12170 }
12171
12172 if_inet6data_lock_shared(ifp);
12173
12174 if (IN6_IFEXTRA(ifp) == NULL) {
12175 error = ENOMEM;
12176 goto out;
12177 }
12178
12179 for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12180 if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
12181 found_one = 1;
12182 }
12183 }
12184
12185 if (found_one == 0) {
12186 error = ENOENT;
12187 goto out;
12188 }
12189
12190 if (prefixes) {
12191 bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
12192 sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
12193 }
12194
12195 out:
12196 if_inet6data_lock_done(ifp);
12197
12198 return error;
12199 }
12200
12201 __attribute__((noinline))
12202 static void
dlil_output_cksum_dbg(struct ifnet * ifp,struct mbuf * m,uint32_t hoff,protocol_family_t pf)12203 dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
12204 protocol_family_t pf)
12205 {
12206 #pragma unused(ifp)
12207 uint32_t did_sw;
12208
12209 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
12210 (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
12211 return;
12212 }
12213
12214 switch (pf) {
12215 case PF_INET:
12216 did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
12217 if (did_sw & CSUM_DELAY_IP) {
12218 hwcksum_dbg_finalized_hdr++;
12219 }
12220 if (did_sw & CSUM_DELAY_DATA) {
12221 hwcksum_dbg_finalized_data++;
12222 }
12223 break;
12224 case PF_INET6:
12225 /*
12226 * Checksum offload should not have been enabled when
12227 * extension headers exist; that also means that we
12228 * cannot force-finalize packets with extension headers.
12229 * Indicate to the callee should it skip such case by
12230 * setting optlen to -1.
12231 */
12232 did_sw = in6_finalize_cksum(m, hoff, -1, -1,
12233 m->m_pkthdr.csum_flags);
12234 if (did_sw & CSUM_DELAY_IPV6_DATA) {
12235 hwcksum_dbg_finalized_data++;
12236 }
12237 break;
12238 default:
12239 return;
12240 }
12241 }
12242
/*
 * Debug hook on the input path: optionally force partial checksum
 * offload (HWCKSUM_DBG_PARTIAL_FORCED), verify driver-supplied partial
 * checksums against a software computation, and optionally re-anchor
 * the partial sum at a different start offset
 * (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) to emulate various hardware.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* sanity-check the frame header pointer against the mbuf bounds */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
12367
12368 static int
12369 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
12370 {
12371 #pragma unused(arg1, arg2)
12372 u_int32_t i;
12373 int err;
12374
12375 i = hwcksum_dbg_mode;
12376
12377 err = sysctl_handle_int(oidp, &i, 0, req);
12378 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12379 return err;
12380 }
12381
12382 if (hwcksum_dbg == 0) {
12383 return ENODEV;
12384 }
12385
12386 if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12387 return EINVAL;
12388 }
12389
12390 hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12391
12392 return err;
12393 }
12394
12395 static int
12396 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12397 {
12398 #pragma unused(arg1, arg2)
12399 u_int32_t i;
12400 int err;
12401
12402 i = hwcksum_dbg_partial_rxoff_forced;
12403
12404 err = sysctl_handle_int(oidp, &i, 0, req);
12405 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12406 return err;
12407 }
12408
12409 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12410 return ENODEV;
12411 }
12412
12413 hwcksum_dbg_partial_rxoff_forced = i;
12414
12415 return err;
12416 }
12417
12418 static int
12419 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12420 {
12421 #pragma unused(arg1, arg2)
12422 u_int32_t i;
12423 int err;
12424
12425 i = hwcksum_dbg_partial_rxoff_adj;
12426
12427 err = sysctl_handle_int(oidp, &i, 0, req);
12428 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12429 return err;
12430 }
12431
12432 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12433 return ENODEV;
12434 }
12435
12436 hwcksum_dbg_partial_rxoff_adj = i;
12437
12438 return err;
12439 }
12440
12441 static int
12442 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12443 {
12444 #pragma unused(oidp, arg1, arg2)
12445 int err;
12446
12447 if (req->oldptr == USER_ADDR_NULL) {
12448 }
12449 if (req->newptr != USER_ADDR_NULL) {
12450 return EPERM;
12451 }
12452 err = SYSCTL_OUT(req, &tx_chain_len_stats,
12453 sizeof(struct chain_len_stats));
12454
12455 return err;
12456 }
12457
12458 #if DEBUG || DEVELOPMENT
12459 /* Blob for sum16 verification */
/*
 * NOTE(review): the leading bytes (0x1f, 0x8b) look like a gzip magic
 * number, so this is presumably a small compressed blob -- its actual
 * content is irrelevant; it only serves as fixed byte input for the
 * checksum self-tests below.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
12495
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t init; /* TRUE once sumr has been computed at run time */
	uint16_t len;   /* number of bytes of sumdata covered by this entry */
	uint16_t sumr;  /* reference */
	uint16_t sumrp; /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* number of entries in sumtbl */
#define SUMTBL_MAX ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12520
/*
 * Boot-time self-test of the 16-bit 1's complement sum routines
 * (m_sum16, b_sum16) against a reference implementation
 * (in_cksum_mbuf_ref), across several lengths and all byte alignments.
 * Panics on any mismatch.
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *); /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (caddr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			/* lazily compute the reference sum, first pass only */
			if (!sumtbl[n].init) {
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (caddr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by aligment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
12609 #endif /* DEBUG || DEVELOPMENT */
12610
12611 #define CASE_STRINGIFY(x) case x: return #x
12612
/*
 * Map a KEV_DL_* kernel event code to its symbolic name for logging.
 * Returns the empty string for unrecognized codes.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
	CASE_STRINGIFY(KEV_DL_SIFFLAGS);
	CASE_STRINGIFY(KEV_DL_SIFMETRICS);
	CASE_STRINGIFY(KEV_DL_SIFMTU);
	CASE_STRINGIFY(KEV_DL_SIFPHYS);
	CASE_STRINGIFY(KEV_DL_SIFMEDIA);
	CASE_STRINGIFY(KEV_DL_SIFGENERIC);
	CASE_STRINGIFY(KEV_DL_ADDMULTI);
	CASE_STRINGIFY(KEV_DL_DELMULTI);
	CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
	CASE_STRINGIFY(KEV_DL_IF_DETACHING);
	CASE_STRINGIFY(KEV_DL_IF_DETACHED);
	CASE_STRINGIFY(KEV_DL_LINK_OFF);
	CASE_STRINGIFY(KEV_DL_LINK_ON);
	CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
	CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
	CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
	CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
	CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
	CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
	CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
	CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
	CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
	CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
	CASE_STRINGIFY(KEV_DL_ISSUES);
	CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
12647
12648 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12649 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12650 {
12651 #pragma unused(arg1)
12652 struct ifnet *ifp = arg0;
12653
12654 if (ifnet_is_attached(ifp, 1)) {
12655 nstat_ifnet_threshold_reached(ifp->if_index);
12656 ifnet_decr_iorefcnt(ifp);
12657 }
12658 }
12659
/*
 * If the interface's combined in/out byte count moved past the
 * configured data threshold since the last notification, schedule a
 * NetworkStatistics notification.  The CAS on if_dt_bytes ensures only
 * one thread claims the crossing, and notifications are rate-limited
 * to the threshold interval via the thread call.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* defer the notification to the next interval boundary */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* no interval configured: notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12689
12690 #if (DEVELOPMENT || DEBUG)
12691 /*
12692 * The sysctl variable name contains the input parameters of
12693 * ifnet_get_keepalive_offload_frames()
12694 * ifp (interface index): name[0]
12695 * frames_array_count: name[1]
12696 * frame_data_offset: name[2]
12697 * The return length gives used_frames_count
12698 */
/*
 * sysctl handler returning the keep-alive offload frames of an
 * interface (see the parameter description in the comment above).
 * Root only; read-only.  Frames are copied out one at a time from a
 * temporary kernel buffer sized by the caller-supplied count.
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root can get look at other people TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
		goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/* Make sure the passed buffer is large enough */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	/* resolve the interface index under the head lock */
	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	ifnet_head_done();

	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
12790 #endif /* DEVELOPMENT || DEBUG */
12791
/* Thin KPI wrapper: forward per-flow interface stats to the TCP layer. */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12798
/*
 * Atomically OR set_flags into *flags_p; returns the value reported by
 * OSBitOrAtomic (the pre-update word, per OSAtomic semantics).
 */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}

/* Atomically clear clear_flags in *flags_p. */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}

/* Atomically set bits in the interface's extended flags (if_eflags). */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}

/* Atomically clear bits in the interface's extended flags (if_eflags). */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}

/* Atomically set bits in the interface's extra flags (if_xflags). */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}

/* Atomically clear bits in the interface's extra flags (if_xflags). */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12834
/*
 * Bump the interface's traffic-rule generation counter (relaxed atomic
 * increment) so observers polling via ifnet_sync_traffic_rule_genid()
 * can detect that the rule set changed.
 */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}
12840
12841 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12842 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12843 {
12844 if (*genid != ifp->if_traffic_rule_genid) {
12845 *genid = ifp->if_traffic_rule_genid;
12846 return TRUE;
12847 }
12848 return FALSE;
12849 }
/*
 * Publish a new traffic-rule count for the interface and bump the
 * generation counter so pollers resync.  The count is written with
 * release ordering — presumably paired with an acquire load on the
 * reader side so the rules the count describes are visible first;
 * TODO(review): confirm the matching acquire at the consumer.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
12856
12857 static void
log_hexdump(void * data,size_t len)12858 log_hexdump(void *data, size_t len)
12859 {
12860 size_t i, j, k;
12861 unsigned char *ptr = (unsigned char *)data;
12862 #define MAX_DUMP_BUF 32
12863 unsigned char buf[3 * MAX_DUMP_BUF + 1];
12864
12865 for (i = 0; i < len; i += MAX_DUMP_BUF) {
12866 for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12867 unsigned char msnbl = ptr[j] >> 4;
12868 unsigned char lsnbl = ptr[j] & 0x0f;
12869
12870 buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12871 buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12872
12873 if ((j % 2) == 1) {
12874 buf[k++] = ' ';
12875 }
12876 if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12877 buf[k++] = ' ';
12878 }
12879 }
12880 buf[k] = 0;
12881 os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12882 }
12883 }
12884
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Report whether only OS-provided interface filters are attached.
 * With a specific interface, only that interface's non-OS filter count
 * is consulted; with ifp == NULL, the global attach counters are
 * compared instead.
 */
static bool
net_check_compatible_if_filter(struct ifnet *ifp)
{
	if (ifp != NULL) {
		return ifp->if_flt_non_os_count == 0;
	}
	return net_api_stats.nas_iflt_attach_count <=
	       net_api_stats.nas_iflt_attach_os_count;
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
12901
/*
 * Advance the output cursor after a scnprintf-style write: consume `k`
 * bytes from the remaining capacity `clen` and move the cursor `c`
 * forward, jumping to the enclosing function's `done:` label once less
 * than one byte of space remains.  Expects `c`, `clen`, `k` and a
 * `done:` label in scope at the expansion site.
 *
 * Fix: wrapped in do { } while (0) (CERT PRE10-C) so the expansion is a
 * single statement — the bare { } block followed by the call site's `;`
 * previously produced an extra empty statement, which breaks in unbraced
 * if/else bodies.
 */
#define DUMP_BUF_CHK() do { \
	clen -= k; \
	if (clen < 1) \
		goto done; \
	c += k; \
} while (0)
12908
/* Debug helper; prototype kept next to the definition (not static). */
int dlil_dump_top_if_qlen(char *, int);
/*
 * Scan all attached interfaces and report the two deepest queues found:
 * the largest send classq depth (ifcq_len) and the largest DLIL input
 * thread packet queue depth, formatted into the caller's buffer.
 *
 * str:     caller-supplied output buffer
 * str_len: capacity of `str` in bytes
 *
 * Returns the number of bytes consumed in `str` (str_len - clen).
 *
 * NOTE(review): the loop condition is `ifidx < if_index`; if if_index is
 * the highest valid index (inclusive), the last interface is skipped —
 * confirm against other ifindex2ifnet walkers in this file.
 * NOTE(review): ifindex2ifnet and the queue fields are read without any
 * visible locking; presumably acceptable for a best-effort debug dump.
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;
	struct ifnet *top_ifcq_ifp = NULL;	/* deepest send classq owner */
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;	/* deepest input queue owner */
	uint32_t top_inq_len = 0;

	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		/* presumably struct dlil_ifnet embeds the ifnet at offset 0
		 * so this cast recovers the container — TODO confirm */
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		/* track the deepest interface send (classq) queue */
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		/* track the deepest DLIL input thread packet queue */
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();	/* advances c / shrinks clen; jumps to done when full */
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12950