1 /*
2 * Copyright (c) 1999-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections. This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34 #include <stddef.h>
35 #include <ptrauth.h>
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/domain.h>
44 #include <sys/user.h>
45 #include <sys/random.h>
46 #include <sys/socketvar.h>
47 #include <net/if_dl.h>
48 #include <net/if.h>
49 #include <net/route.h>
50 #include <net/if_var.h>
51 #include <net/dlil.h>
52 #include <net/if_arp.h>
53 #include <net/iptap.h>
54 #include <net/pktap.h>
55 #include <net/nwk_wq.h>
56 #include <sys/kern_event.h>
57 #include <sys/kdebug.h>
58 #include <sys/mcache.h>
59 #include <sys/syslog.h>
60 #include <sys/protosw.h>
61 #include <sys/priv.h>
62
63 #include <kern/assert.h>
64 #include <kern/task.h>
65 #include <kern/thread.h>
66 #include <kern/sched_prim.h>
67 #include <kern/locks.h>
68 #include <kern/zalloc.h>
69
70 #include <net/kpi_protocol.h>
71 #include <net/if_types.h>
72 #include <net/if_ipsec.h>
73 #include <net/if_llreach.h>
74 #include <net/if_utun.h>
75 #include <net/kpi_interfacefilter.h>
76 #include <net/classq/classq.h>
77 #include <net/classq/classq_sfb.h>
78 #include <net/flowhash.h>
79 #include <net/ntstat.h>
80 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
81 #include <skywalk/lib/net_filter_event.h>
82 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
83 #include <net/if_llatbl.h>
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143
144 #include <os/log.h>
145
146 #define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0)
147 #define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2)
148 #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
149 #define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
150 #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
151
152 #define IFNET_KTRACE_TX_PKT_DUMP IFNETDBG_CODE(DBG_IFNET, 0x001)
153 #define IFNET_KTRACE_RX_PKT_DUMP IFNETDBG_CODE(DBG_IFNET, 0x002)
154
155 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
156 #define MAX_LINKADDR 4 /* LONGWORDS */
157
158
159 #if 1
160 #define DLIL_PRINTF printf
161 #else
162 #define DLIL_PRINTF kprintf
163 #endif
164
165 #define IF_DATA_REQUIRE_ALIGNED_64(f) \
166 _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
167
168 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \
169 _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
170
/*
 * Protocol KPI version tags.  Stored in if_proto.proto_kpi to select
 * which member of the if_proto kpi union (v1 or v2) is valid.
 */
enum {
	kProtoKPI_v1 = 1,
	kProtoKPI_v2 = 2
};
175
/*
 * List of if_proto structures in if_proto_hash[] is protected by
 * the ifnet lock. The rest of the fields are initialized at protocol
 * attach time and never change, thus no lock required as long as
 * a reference to it is valid, via if_proto_ref().
 */
struct if_proto {
	SLIST_ENTRY(if_proto) next_hash;        /* if_proto_hash[] bucket linkage */
	u_int32_t refcount;                     /* refs; if_proto_ref()/if_proto_free() */
	u_int32_t detached;                     /* nonzero once detached from the ifnet */
	struct ifnet *ifp;                      /* interface this protocol is attached to */
	protocol_family_t protocol_family;      /* protocol family identifier */
	int proto_kpi;                          /* kProtoKPI_v1 or kProtoKPI_v2; selects union arm below */
	union {
		/* v1 KPI callbacks (per-packet input; see proto_media_input) */
		struct {
			proto_media_input input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v1;
		/* v2 KPI callbacks (packet-chain input; see proto_media_input_v2) */
		struct {
			proto_media_input_v2 input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v2;
	} kpi;
};
210
211 SLIST_HEAD(proto_hash_entry, if_proto);
212
213 #define DLIL_SDLDATALEN \
214 (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
215
/*
 * DLIL's private wrapper around the public ifnet.  dl_if MUST remain the
 * first member: IFP_TO_DLIL()/DLIL_TO_IFP() below convert between the two
 * by simple pointer cast/member access.  Instances are tracked on
 * dlil_ifnet_head via dl_if_link and recycled (see DLIF_INUSE/DLIF_REUSE).
 */
struct dlil_ifnet {
	struct ifnet dl_if; /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */
	u_int32_t dl_if_flags;  /* flags (below) */
	u_int32_t dl_if_refcnt; /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void    *dl_if_uniqueid;        /* unique interface id */
	size_t dl_if_uniqueid_len;      /* length of the unique id */
	char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */
	char dl_if_xnamestorage[IFXNAMSIZ]; /* external name storage */
	struct {
		struct ifaddr ifa;      /* lladdr ifa */
		u_int8_t asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr;
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;
	u_int8_t dl_if_unused;
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t        dl_if_attach;   /* attach PC stacktrace */
	ctrace_t        dl_if_detach;   /* detach PC stacktrace */
};
243
244 /* Values for dl_if_flags (private to DLIL) */
245 #define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */
246 #define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */
247 #define DLIF_DEBUG 0x4 /* has debugging info */
248
249 #define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */
250
251 /* For gdb */
252 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
253
/*
 * Debug variant of dlil_ifnet carrying reference-count tracing history.
 * dldbg_dlif MUST remain the first member so the structure can be used
 * wherever a struct dlil_ifnet is expected.
 * NOTE(review): presumably allocated in place of plain dlil_ifnet when
 * ifnet_debug is enabled — confirm against dlil_init/allocation path.
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet       dldbg_dlif;             /* dlil_ifnet */
	u_int16_t               dldbg_if_refhold_cnt;   /* # ifnet references */
	u_int16_t               dldbg_if_refrele_cnt;   /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t                dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t                dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
264
265 #define DLIL_TO_IFP(s) (&s->dl_if)
266 #define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s)
267
/*
 * An attached interface filter.  Allocated from dlif_filt_zone; linked
 * on its interface's filter list via filt_next.  The iff_* callbacks
 * mirror the kpi_interfacefilter.h interface-filter KPI.
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter) filt_next;    /* per-ifnet filter list linkage */
	u_int32_t               filt_skip;      /* NOTE(review): appears to mark filter as skipped during processing — confirm */
	u_int32_t               filt_flags;     /* filter flags */
	ifnet_t                 filt_ifp;       /* interface this filter is attached to */
	const char              *filt_name;     /* filter name (for reporting) */
	void                    *filt_cookie;   /* opaque client context passed to callbacks */
	protocol_family_t       filt_protocol;  /* protocol restriction (if any) */
	iff_input_func          filt_input;     /* inbound packet hook */
	iff_output_func         filt_output;    /* outbound packet hook */
	iff_event_func          filt_event;     /* interface event hook */
	iff_ioctl_func          filt_ioctl;     /* ioctl hook */
	iff_detached_func       filt_detached;  /* detach notification */
};
282
283 struct proto_input_entry;
284
285 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
286
287 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
288
289 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
290 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
291 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
292 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
293 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
294
295 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
296 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
297 &dlil_lck_attributes);
298 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
299 &dlil_lck_attributes);
300
301 #if DEBUG
302 static unsigned int ifnet_debug = 1; /* debugging (enabled) */
303 #else
304 static unsigned int ifnet_debug; /* debugging (disabled) */
305 #endif /* !DEBUG */
306 static unsigned int dlif_size; /* size of dlil_ifnet to allocate */
307 static unsigned int dlif_bufsize; /* size of dlif_size + headroom */
308 static struct zone *dlif_zone; /* zone for dlil_ifnet */
309 #define DLIF_ZONE_NAME "ifnet" /* zone name */
310
311 static ZONE_DEFINE(dlif_filt_zone, "ifnet_filter",
312 sizeof(struct ifnet_filter), ZC_ZFREE_CLEARMEM);
313
314 static ZONE_DEFINE(dlif_phash_zone, "ifnet_proto_hash",
315 sizeof(struct proto_hash_entry) * PROTO_HASH_SLOTS, ZC_ZFREE_CLEARMEM);
316
317 static ZONE_DEFINE(dlif_proto_zone, "ifnet_proto",
318 sizeof(struct if_proto), ZC_ZFREE_CLEARMEM);
319
320 static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */
321 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
322 static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */
323 #define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */
324
325 static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */
326 static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */
327 static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */
328 #define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */
329
330 static u_int32_t net_rtref;
331
332 static struct dlil_main_threading_info dlil_main_input_thread_info;
333 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
334 (struct dlil_threading_info *)&dlil_main_input_thread_info;
335
336 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
337 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
338 static void dlil_if_trace(struct dlil_ifnet *, int);
339 static void if_proto_ref(struct if_proto *);
340 static void if_proto_free(struct if_proto *);
341 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
342 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
343 u_int32_t list_count);
344 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
345 static void if_flt_monitor_busy(struct ifnet *);
346 static void if_flt_monitor_unbusy(struct ifnet *);
347 static void if_flt_monitor_enter(struct ifnet *);
348 static void if_flt_monitor_leave(struct ifnet *);
349 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
350 char **, protocol_family_t);
351 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
352 protocol_family_t);
353 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
354 const struct sockaddr_dl *);
355 static int ifnet_lookup(struct ifnet *);
356 static void if_purgeaddrs(struct ifnet *);
357
358 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
359 struct mbuf *, char *);
360 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
361 struct mbuf *);
362 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
363 mbuf_t *, const struct sockaddr *, void *, char *, char *);
364 static void ifproto_media_event(struct ifnet *, protocol_family_t,
365 const struct kev_msg *);
366 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
367 unsigned long, void *);
368 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
369 struct sockaddr_dl *, size_t);
370 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
371 const struct sockaddr_dl *, const struct sockaddr *,
372 const struct sockaddr_dl *, const struct sockaddr *);
373
374 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
375 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
376 boolean_t poll, struct thread *tp);
377 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
378 struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
379 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
380 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
381 protocol_family_t *);
382 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
383 const struct ifnet_demux_desc *, u_int32_t);
384 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
385 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
386 #if !XNU_TARGET_OS_OSX
387 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
388 const struct sockaddr *, const char *, const char *,
389 u_int32_t *, u_int32_t *);
390 #else /* XNU_TARGET_OS_OSX */
391 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
392 const struct sockaddr *, const char *, const char *);
393 #endif /* XNU_TARGET_OS_OSX */
394 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
395 const struct sockaddr *, const char *, const char *,
396 u_int32_t *, u_int32_t *);
397 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
398 static void ifp_if_free(struct ifnet *);
399 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
400 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
401 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
402
403 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
404 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
405 boolean_t, struct thread *);
406 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
407 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
408 boolean_t, struct thread *);
409
410 static void dlil_main_input_thread_func(void *, wait_result_t);
411 static void dlil_main_input_thread_cont(void *, wait_result_t);
412
413 static void dlil_input_thread_func(void *, wait_result_t);
414 static void dlil_input_thread_cont(void *, wait_result_t);
415
416 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
417 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
418
419 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
420 thread_continue_t *);
421 static void dlil_terminate_input_thread(struct dlil_threading_info *);
422 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
423 struct dlil_threading_info *, struct ifnet *, boolean_t);
424 static boolean_t dlil_input_stats_sync(struct ifnet *,
425 struct dlil_threading_info *);
426 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
427 u_int32_t, ifnet_model_t, boolean_t);
428 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
429 const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
430 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
431 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
432 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
433 #if DEBUG || DEVELOPMENT
434 static void dlil_verify_sum16(void);
435 #endif /* DEBUG || DEVELOPMENT */
436 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
437 protocol_family_t);
438 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
439 protocol_family_t);
440
441 static void dlil_incr_pending_thread_count(void);
442 static void dlil_decr_pending_thread_count(void);
443
444 static void ifnet_detacher_thread_func(void *, wait_result_t);
445 static void ifnet_detacher_thread_cont(void *, wait_result_t);
446 static void ifnet_detach_final(struct ifnet *);
447 static void ifnet_detaching_enqueue(struct ifnet *);
448 static struct ifnet *ifnet_detaching_dequeue(void);
449
450 static void ifnet_start_thread_func(void *, wait_result_t);
451 static void ifnet_start_thread_cont(void *, wait_result_t);
452
453 static void ifnet_poll_thread_func(void *, wait_result_t);
454 static void ifnet_poll_thread_cont(void *, wait_result_t);
455
456 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
457 classq_pkt_t *, boolean_t, boolean_t *);
458
459 static void ifp_src_route_copyout(struct ifnet *, struct route *);
460 static void ifp_src_route_copyin(struct ifnet *, struct route *);
461 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
462 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
463
464 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
465 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
466 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
467 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
468 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
469 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
470 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
471 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
472 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
473 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
474 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
475
476 struct chain_len_stats tx_chain_len_stats;
477 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
478
479 #if TEST_INPUT_THREAD_TERMINATION
480 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
481 #endif /* TEST_INPUT_THREAD_TERMINATION */
482
483
484 /* The following are protected by dlil_ifnet_lock */
485 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
486 static u_int32_t ifnet_detaching_cnt;
487 static boolean_t ifnet_detaching_embryonic;
488 static void *ifnet_delayed_run; /* wait channel for detaching thread */
489
490 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
491 &dlil_lck_attributes);
492
493 static uint32_t ifnet_flowhash_seed;
494
/*
 * Key material hashed by ifnet_calc_flowhash() (together with
 * ifnet_flowhash_seed) to derive a per-interface flow hash; the ifk_rand*
 * fields add per-boot randomness.
 */
struct ifnet_flowhash_key {
	char            ifk_name[IFNAMSIZ];     /* interface name */
	uint32_t        ifk_unit;               /* interface unit number */
	uint32_t        ifk_flags;              /* interface flags */
	uint32_t        ifk_eflags;             /* extended flags */
	uint32_t        ifk_capabilities;       /* capability bits */
	uint32_t        ifk_capenable;          /* enabled capabilities */
	uint32_t        ifk_output_sched_model; /* output scheduling model */
	uint32_t        ifk_rand1;              /* random salt */
	uint32_t        ifk_rand2;              /* random salt */
};
506
/*
 * Flow control entry per interface.  Nodes of ifnet_fc_tree (an RB tree
 * keyed by flow hash; see ifce_cmp/RB_GENERATE below), protected by
 * ifnet_fc_lock and allocated from ifnet_fc_zone.
 */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;    /* ifnet_fc_tree linkage */
	u_int32_t       ifce_flowhash;          /* tree key: interface flow hash */
	struct ifnet    *ifce_ifp;              /* back-pointer to the interface */
};
513
514 static uint32_t ifnet_calc_flowhash(struct ifnet *);
515 static int ifce_cmp(const struct ifnet_fc_entry *,
516 const struct ifnet_fc_entry *);
517 static int ifnet_fc_add(struct ifnet *);
518 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
519 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
520
521 /* protected by ifnet_fc_lock */
522 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
523 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
524 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
525
526 static ZONE_DEFINE(ifnet_fc_zone, "ifnet_fc_zone",
527 sizeof(struct ifnet_fc_entry), ZC_ZFREE_CLEARMEM);
528
529 extern void bpfdetach(struct ifnet *);
530 extern void proto_input_run(void);
531
532 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
533 u_int32_t flags);
534 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
535 u_int32_t flags);
536
537 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
538
539 #if CONFIG_MACF
540 #if !XNU_TARGET_OS_OSX
541 int dlil_lladdr_ckreq = 1;
542 #else /* XNU_TARGET_OS_OSX */
543 int dlil_lladdr_ckreq = 0;
544 #endif /* XNU_TARGET_OS_OSX */
545 #endif /* CONFIG_MACF */
546
547 #if DEBUG
548 int dlil_verbose = 1;
549 #else
550 int dlil_verbose = 0;
551 #endif /* DEBUG */
552 #if IFNET_INPUT_SANITY_CHK
553 /* sanity checking of input packet lists received */
554 static u_int32_t dlil_input_sanity_check = 0;
555 #endif /* IFNET_INPUT_SANITY_CHK */
556 /* rate limit debug messages */
557 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
558
559 SYSCTL_DECL(_net_link_generic_system);
560
561 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
562 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
563
564 #define IF_SNDQ_MINLEN 32
565 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
566 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
567 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
568 sysctl_sndq_maxlen, "I", "Default transmit queue max length");
569
570 #define IF_RCVQ_MINLEN 32
571 #define IF_RCVQ_MAXLEN 256
572 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
573 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
574 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
575 sysctl_rcvq_maxlen, "I", "Default receive queue max length");
576
577 #define IF_RXPOLL_DECAY 2 /* ilog2 of EWMA decay rate (4) */
578 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
579 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
580 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
581 "ilog2 of EWMA decay rate of avg inbound packets");
582
583 #define IF_RXPOLL_MODE_HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10 ms */
584 #define IF_RXPOLL_MODE_HOLDTIME (1000ULL * 1000 * 1000) /* 1 sec */
585 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
586 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
587 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
588 IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
589 "Q", "input poll mode freeze time");
590
591 #define IF_RXPOLL_SAMPLETIME_MIN (1ULL * 1000 * 1000) /* 1 ms */
592 #define IF_RXPOLL_SAMPLETIME (10ULL * 1000 * 1000) /* 10 ms */
593 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
594 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
595 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
596 IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
597 "Q", "input poll sampling time");
598
599 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
600 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
601 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
602 IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
603 "Q", "input poll interval (time)");
604
605 #define IF_RXPOLL_INTERVAL_PKTS 0 /* 0 (disabled) */
606 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
607 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
608 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
609 IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
610
611 #define IF_RXPOLL_WLOWAT 10
612 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
613 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
614 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
615 IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
616 "I", "input poll wakeup low watermark");
617
618 #define IF_RXPOLL_WHIWAT 100
619 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
620 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
621 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
622 IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
623 "I", "input poll wakeup high watermark");
624
625 static u_int32_t if_rxpoll_max = 0; /* 0 (automatic) */
626 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
627 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
628 "max packets per poll call");
629
630 u_int32_t if_rxpoll = 1;
631 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
632 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
633 sysctl_rxpoll, "I", "enable opportunistic input polling");
634
635 #if TEST_INPUT_THREAD_TERMINATION
636 static u_int32_t if_input_thread_termination_spin = 0;
637 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
638 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
639 &if_input_thread_termination_spin, 0,
640 sysctl_input_thread_termination_spin,
641 "I", "input thread termination spin limit");
642 #endif /* TEST_INPUT_THREAD_TERMINATION */
643
644 static u_int32_t cur_dlil_input_threads = 0;
645 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
646 CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
647 "Current number of DLIL input threads");
648
649 #if IFNET_INPUT_SANITY_CHK
650 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
651 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
652 "Turn on sanity checking in DLIL input");
653 #endif /* IFNET_INPUT_SANITY_CHK */
654
655 static u_int32_t if_flowadv = 1;
656 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
657 CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
658 "enable flow-advisory mechanism");
659
660 static u_int32_t if_delaybased_queue = 1;
661 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
662 CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
663 "enable delay based dynamic queue sizing");
664
665 static uint64_t hwcksum_in_invalidated = 0;
666 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
667 hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
668 &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
669
670 uint32_t hwcksum_dbg = 0;
671 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
672 CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
673 "enable hardware cksum debugging");
674
675 u_int32_t ifnet_start_delayed = 0;
676 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
677 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
678 "number of times start was delayed");
679
680 u_int32_t ifnet_delay_start_disabled = 0;
681 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
682 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
683 "number of times start was delayed");
684
685 #if DEVELOPMENT || DEBUG
686 static int packet_dump_trace_update SYSCTL_HANDLER_ARGS;
687
688 struct flow_key flow_key_trace;
689 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, flow_key_trace, CTLFLAG_WR | CTLFLAG_LOCKED |
690 CTLFLAG_KERN | CTLFLAG_ANYBODY, 0, 0, packet_dump_trace_update, "S", "Set flow key for packet tracing");
691 #endif /* DEVELOPMENT || DEBUG */
692
/*
 * Atomically bump the count of times delayed transmit start was disabled;
 * the counter is exported read/write via the
 * net.link.generic.system.start_delay_disabled sysctl above.
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
698
699 #define HWCKSUM_DBG_PARTIAL_FORCED 0x1 /* forced partial checksum */
700 #define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ 0x2 /* adjust start offset */
701 #define HWCKSUM_DBG_FINALIZE_FORCED 0x10 /* forced finalize */
702 #define HWCKSUM_DBG_MASK \
703 (HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ | \
704 HWCKSUM_DBG_FINALIZE_FORCED)
705
706 static uint32_t hwcksum_dbg_mode = 0;
707 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
708 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
709 0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");
710
711 static uint64_t hwcksum_dbg_partial_forced = 0;
712 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
713 hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
714 &hwcksum_dbg_partial_forced, "packets forced using partial cksum");
715
716 static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
717 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
718 hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
719 &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");
720
721 static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
722 SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
723 hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
724 &hwcksum_dbg_partial_rxoff_forced, 0,
725 sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
726 "forced partial cksum rx offset");
727
/* debug knob: adjusted partial checksum receive offset */
static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

/* read-only debug counters for hardware checksum offload activity */
static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

/* global enable/disable for transmit hardware checksum offload */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

/* global enable/disable for receive hardware checksum offload */
uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

/* transmit chain length statistics (arg2 = 9, i.e. 10 histogram buckets) */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

static uint32_t threshold_notify = 1; /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2; /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

/* aggregate networking API usage statistics, exported via net.api_stats */
struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

/* debug knob for wake-packet logging */
uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
816
817 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)818 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
819 {
820 /*
821 * update filter count and route_generation ID to let TCP
822 * know it should reevalute doing TSO or not
823 */
824 if (filter_enable) {
825 OSAddAtomic(1, &ifp->if_flt_no_tso_count);
826 } else {
827 VERIFY(ifp->if_flt_no_tso_count != 0);
828 OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
829 }
830 routegenid_update();
831 }
832
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* non-zero when the flowswitch IP netagent is enabled by default */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
/* non-zero when the flowswitch transport netagent is enabled by default */
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

/* non-zero when netif attach applies to all interfaces by default */
unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
851
#if (DEVELOPMENT || DEBUG)
/*
 * sysctl handler for net.link.generic.system.if_attach_nx: reads or
 * updates the if_attach_nx bitmask.  The transport netagent bit may
 * not be toggled at runtime through this knob.
 */
static int
if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int value;
	int changed;
	int error;

	error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
	    &value, &changed);
	if (error != 0) {
		return error;
	}
	if (!changed) {
		return 0;
	}
	/* refuse any change to the transport netagent bit */
	if ((value ^ if_attach_nx) & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) {
		return ENOTSUP;
	}
	if_attach_nx = value;
	return 0;
}

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
879
880 static int
881 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
882 {
883 #pragma unused(oidp, arg1, arg2)
884 unsigned int new_value;
885 int changed;
886 int error;
887
888 error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
889 sizeof(if_enable_fsw_transport_netagent),
890 &new_value, &changed);
891 if (error == 0 && changed != 0) {
892 if (new_value != 0 && new_value != 1) {
893 /* only allow 0 or 1 */
894 error = EINVAL;
895 } else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
896 /* netagent can be enabled/disabled */
897 if_enable_fsw_transport_netagent = new_value;
898 if (new_value == 0) {
899 kern_nexus_deregister_netagents();
900 } else {
901 kern_nexus_register_netagents();
902 }
903 } else {
904 /* netagent can't be enabled */
905 error = ENOTSUP;
906 }
907 }
908 return error;
909 }
910
911 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
912 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
913 0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
914 "enable flowswitch netagent");
915
916 static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);
917
918 #include <skywalk/os_skywalk_private.h>
919
920 boolean_t
ifnet_nx_noauto(ifnet_t ifp)921 ifnet_nx_noauto(ifnet_t ifp)
922 {
923 return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
924 }
925
/*
 * TRUE when a flowswitch must not be auto-attached to this interface;
 * currently that is exactly the low-latency interfaces.
 */
boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}
931
932 boolean_t
ifnet_is_low_latency(ifnet_t ifp)933 ifnet_is_low_latency(ifnet_t ifp)
934 {
935 return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
936 }
937
938 boolean_t
ifnet_needs_compat(ifnet_t ifp)939 ifnet_needs_compat(ifnet_t ifp)
940 {
941 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
942 return FALSE;
943 }
944 #if !XNU_TARGET_OS_OSX
945 /*
946 * To conserve memory, we plumb in the compat layer selectively; this
947 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
948 * In particular, we check for Wi-Fi Access Point.
949 */
950 if (IFNET_IS_WIFI(ifp)) {
951 /* Wi-Fi Access Point */
952 if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
953 ifp->if_name[2] == '\0') {
954 return if_netif_all;
955 }
956 }
957 #else /* XNU_TARGET_OS_OSX */
958 #pragma unused(ifp)
959 #endif /* XNU_TARGET_OS_OSX */
960 return TRUE;
961 }
962
963 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)964 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
965 {
966 if (if_is_fsw_transport_netagent_enabled()) {
967 /* check if netagent has been manually enabled for ipsec/utun */
968 if (ifp->if_family == IFNET_FAMILY_IPSEC) {
969 return ipsec_interface_needs_netagent(ifp);
970 } else if (ifp->if_family == IFNET_FAMILY_UTUN) {
971 return utun_interface_needs_netagent(ifp);
972 }
973
974 /* check ifnet no auto nexus override */
975 if (ifnet_nx_noauto(ifp)) {
976 return FALSE;
977 }
978
979 /* check global if_attach_nx configuration */
980 switch (ifp->if_family) {
981 case IFNET_FAMILY_CELLULAR:
982 case IFNET_FAMILY_ETHERNET:
983 if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
984 return TRUE;
985 }
986 break;
987 default:
988 break;
989 }
990 }
991 return FALSE;
992 }
993
994 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)995 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
996 {
997 #pragma unused(ifp)
998 if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
999 return TRUE;
1000 }
1001 return FALSE;
1002 }
1003
1004 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)1005 ifnet_needs_netif_netagent(ifnet_t ifp)
1006 {
1007 #pragma unused(ifp)
1008 return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1009 }
1010
1011 static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,const char * func_str,uuid_t instance,uuid_t device)1012 dlil_detach_nexus_instance(nexus_controller_t controller,
1013 const char *func_str, uuid_t instance, uuid_t device)
1014 {
1015 errno_t err;
1016
1017 if (instance == NULL || uuid_is_null(instance)) {
1018 return FALSE;
1019 }
1020
1021 /* followed by the device port */
1022 if (device != NULL && !uuid_is_null(device)) {
1023 err = kern_nexus_ifdetach(controller, instance, device);
1024 if (err != 0) {
1025 DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
1026 func_str, err);
1027 }
1028 }
1029 err = kern_nexus_controller_free_provider_instance(controller,
1030 instance);
1031 if (err != 0) {
1032 DLIL_PRINTF("%s free_provider_instance failed %d\n",
1033 func_str, err);
1034 }
1035 return TRUE;
1036 }
1037
1038 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1039 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1040 uuid_t device)
1041 {
1042 boolean_t detached = FALSE;
1043 nexus_controller_t controller = kern_nexus_shared_controller();
1044 int err;
1045
1046 if (dlil_detach_nexus_instance(controller, func_str, instance,
1047 device)) {
1048 detached = TRUE;
1049 }
1050 if (provider != NULL && !uuid_is_null(provider)) {
1051 detached = TRUE;
1052 err = kern_nexus_controller_deregister_provider(controller,
1053 provider);
1054 if (err != 0) {
1055 DLIL_PRINTF("%s deregister_provider %d\n",
1056 func_str, err);
1057 }
1058 }
1059 return detached;
1060 }
1061
1062 static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,nexus_type_t type,ifnet_t ifp,uuid_t * provider,uuid_t * instance,nexus_attr_t attr)1063 dlil_create_provider_and_instance(nexus_controller_t controller,
1064 nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
1065 nexus_attr_t attr)
1066 {
1067 uuid_t dom_prov;
1068 errno_t err;
1069 nexus_name_t provider_name;
1070 const char *type_name =
1071 (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
1072 struct kern_nexus_init init;
1073
1074 err = kern_nexus_get_default_domain_provider(type, &dom_prov);
1075 if (err != 0) {
1076 DLIL_PRINTF("%s can't get %s provider, error %d\n",
1077 __func__, type_name, err);
1078 goto failed;
1079 }
1080
1081 snprintf((char *)provider_name, sizeof(provider_name),
1082 "com.apple.%s.%s", type_name, if_name(ifp));
1083 err = kern_nexus_controller_register_provider(controller,
1084 dom_prov,
1085 provider_name,
1086 NULL,
1087 0,
1088 attr,
1089 provider);
1090 if (err != 0) {
1091 DLIL_PRINTF("%s register %s provider failed, error %d\n",
1092 __func__, type_name, err);
1093 goto failed;
1094 }
1095 bzero(&init, sizeof(init));
1096 init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
1097 err = kern_nexus_controller_alloc_provider_instance(controller,
1098 *provider,
1099 NULL, NULL,
1100 instance, &init);
1101 if (err != 0) {
1102 DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
1103 __func__, type_name, err);
1104 kern_nexus_controller_deregister_provider(controller,
1105 *provider);
1106 goto failed;
1107 }
1108 failed:
1109 return err;
1110 }
1111
/*
 * Create a netif nexus provider/instance for `ifp' and attach the
 * interface to it.  On success the provider, instance and attach UUIDs
 * are recorded in `netif_nx' and TRUE is returned; on failure anything
 * created so far is torn down and FALSE is returned.
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	/*
	 * NOTE(review): `attr' is destroyed only on the failure path; the
	 * success return skips kern_nexus_attr_destroy() — confirm whether
	 * attr ownership transfers to the nexus or this is a leak.
	 */
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1165
1166 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1167 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1168 {
1169 if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1170 IFNET_IS_VMNET(ifp)) {
1171 goto failed;
1172 }
1173 switch (ifp->if_type) {
1174 case IFT_CELLULAR:
1175 case IFT_ETHER:
1176 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1177 /* don't auto-attach */
1178 goto failed;
1179 }
1180 break;
1181 default:
1182 /* don't auto-attach */
1183 goto failed;
1184 }
1185 return dlil_attach_netif_nexus_common(ifp, netif_nx);
1186
1187 failed:
1188 return FALSE;
1189 }
1190
1191 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1192 dlil_is_native_netif_nexus(ifnet_t ifp)
1193 {
1194 return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1195 }
1196
/*
 * Tear down the netif nexus recorded in `nexus_netif' (attach, instance
 * and provider).
 */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1204
1205 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1206 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1207 {
1208 struct ifreq ifr;
1209 int error;
1210
1211 bzero(&ifr, sizeof(ifr));
1212 error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1213 if (error == 0) {
1214 *ifdm_p = ifr.ifr_devmtu;
1215 }
1216 return error;
1217 }
1218
1219 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1220 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1221 bool *use_multi_buflet, uint32_t *large_buf_size)
1222 {
1223 struct kern_pbufpool_memory_info rx_pp_info;
1224 struct kern_pbufpool_memory_info tx_pp_info;
1225 uint32_t if_max_mtu = 0;
1226 uint32_t drv_buf_size;
1227 struct ifdevmtu ifdm;
1228 int err;
1229
1230 /*
1231 * To perform intra-stack RX aggregation flowswitch needs to use
1232 * multi-buflet packet.
1233 */
1234 *use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1235
1236 *large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1237 /*
1238 * IP over Thunderbolt interface can deliver the largest IP packet,
1239 * but the driver advertises the MAX MTU as only 9K.
1240 */
1241 if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1242 if_max_mtu = IP_MAXPACKET;
1243 goto skip_mtu_ioctl;
1244 }
1245
1246 /* determine max mtu */
1247 bzero(&ifdm, sizeof(ifdm));
1248 err = dlil_siocgifdevmtu(ifp, &ifdm);
1249 if (__improbable(err != 0)) {
1250 DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1251 __func__, if_name(ifp));
1252 /* use default flowswitch buffer size */
1253 if_max_mtu = NX_FSW_BUFSIZE;
1254 } else {
1255 DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1256 ifdm.ifdm_max, ifdm.ifdm_current);
1257 /* rdar://problem/44589731 */
1258 if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1259 }
1260
1261 skip_mtu_ioctl:
1262 if (if_max_mtu == 0) {
1263 DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1264 __func__, if_name(ifp));
1265 return EINVAL;
1266 }
1267 if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1268 DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1269 "max bufsize(%d)\n", __func__,
1270 if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1271 return EINVAL;
1272 }
1273
1274 /*
1275 * for skywalk native driver, consult the driver packet pool also.
1276 */
1277 if (dlil_is_native_netif_nexus(ifp)) {
1278 err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1279 &tx_pp_info);
1280 if (err != 0) {
1281 DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1282 __func__, if_name(ifp));
1283 return ENXIO;
1284 }
1285 drv_buf_size = tx_pp_info.kpm_bufsize *
1286 tx_pp_info.kpm_max_frags;
1287 if (if_max_mtu > drv_buf_size) {
1288 DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1289 "tx %d * %d) can't support max mtu(%d)\n", __func__,
1290 if_name(ifp), rx_pp_info.kpm_bufsize,
1291 rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1292 tx_pp_info.kpm_max_frags, if_max_mtu);
1293 return EINVAL;
1294 }
1295 } else {
1296 drv_buf_size = if_max_mtu;
1297 }
1298
1299 if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1300 _CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1301 *use_multi_buflet = true;
1302 /* default flowswitch buffer size */
1303 *buf_size = NX_FSW_BUFSIZE;
1304 *large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1305 } else {
1306 *buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1307 }
1308
1309 /*
1310 * if HW TSO is enabled on a Skywalk native interface then make
1311 * the flowswitch default buffer be able to handle max TSO segment.
1312 */
1313 uint32_t tso_v4_mtu = 0;
1314 uint32_t tso_v6_mtu = 0;
1315 #ifdef XNU_TARGET_OS_OSX
1316 if (dlil_is_native_netif_nexus(ifp)) {
1317 if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
1318 tso_v4_mtu = ifp->if_tso_v4_mtu;
1319 }
1320 if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
1321 tso_v6_mtu = ifp->if_tso_v6_mtu;
1322 }
1323 }
1324 #endif /* XNU_TARGET_OS_OSX */
1325 if ((tso_v4_mtu != 0) || (tso_v6_mtu != 0)) {
1326 *buf_size = max(*buf_size, max(tso_v4_mtu, tso_v6_mtu));
1327 ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1328 }
1329 if (*buf_size >= *large_buf_size) {
1330 *large_buf_size = 0;
1331 }
1332 return 0;
1333 }
1334
/*
 * Create and attach a flowswitch nexus to `ifp'.  The interface must
 * already have a netif (native or compat) plumbed and not be opted out
 * of auto-attach.  On success the provider/instance/device UUIDs are
 * recorded in `nexus_fsw' and TRUE is returned; otherwise FALSE.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err = 0;
	uuid_t netif;
	uint32_t buf_size = 0;
	uint32_t large_buf_size = 0;
	bool multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	/* compute buffer geometry from the interface's max MTU / pool */
	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	/*
	 * NOTE(review): `attr' is destroyed only on the failure path; the
	 * success return skips kern_nexus_attr_destroy() — confirm whether
	 * attr ownership transfers to the nexus or this is a leak.
	 */
	return TRUE;

failed:
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1433
1434 static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)1435 dlil_attach_flowswitch_nexus(ifnet_t ifp)
1436 {
1437 boolean_t attached;
1438 if_nexus_flowswitch nexus_fsw;
1439
1440 #if (DEVELOPMENT || DEBUG)
1441 if (skywalk_netif_direct_allowed(if_name(ifp))) {
1442 DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
1443 return FALSE;
1444 }
1445 #endif /* (DEVELOPMENT || DEBUG) */
1446
1447 /*
1448 * flowswitch attachment is not supported for interface using the
1449 * legacy model (IFNET_INIT_LEGACY)
1450 */
1451 if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
1452 DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
1453 if_name(ifp));
1454 return FALSE;
1455 }
1456
1457 if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
1458 /* it's already attached */
1459 return FALSE;
1460 }
1461 bzero(&nexus_fsw, sizeof(nexus_fsw));
1462 attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
1463 if (attached) {
1464 ifnet_lock_exclusive(ifp);
1465 if (!IF_FULLY_ATTACHED(ifp)) {
1466 /* interface is going away */
1467 attached = FALSE;
1468 } else {
1469 ifp->if_nx_flowswitch = nexus_fsw;
1470 }
1471 ifnet_lock_done(ifp);
1472 if (!attached) {
1473 /* clean up flowswitch nexus */
1474 dlil_detach_flowswitch_nexus(&nexus_fsw);
1475 }
1476 }
1477 return attached;
1478 }
1479
/*
 * Tear down the flowswitch nexus recorded in `nexus_fsw' (device port,
 * instance and provider).
 */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1487
1488 __attribute__((noinline))
1489 static void
dlil_netif_detach_notify(ifnet_t ifp)1490 dlil_netif_detach_notify(ifnet_t ifp)
1491 {
1492 void (*detach_notify)(struct nexus_netif_adapter *);
1493
1494 /*
1495 * This is only needed for low latency interfaces for now.
1496 */
1497 if (!ifnet_is_low_latency(ifp)) {
1498 return;
1499 }
1500 detach_notify = (ifp->if_na_ops != NULL) ? ifp->if_na_ops->ni_detach_notify : NULL;
1501 if (detach_notify != NULL) {
1502 (*detach_notify)(ifp->if_na);
1503 } else {
1504 DLIL_PRINTF("%s: %s has no detach notify calback\n",
1505 __func__, if_name(ifp));
1506 }
1507 }
1508
/*
 * Quiesce data movement on `ifp' and detach any attached flowswitch
 * and netif nexuses; used during interface detach.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block and drain all data movement before tearing anything down */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		/* flowswitch never attached: all of its UUIDs must be null */
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		/* netif never attached: all of its UUIDs must be null */
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1540
1541 boolean_t
ifnet_add_netagent(ifnet_t ifp)1542 ifnet_add_netagent(ifnet_t ifp)
1543 {
1544 int error;
1545
1546 error = kern_nexus_interface_add_netagent(ifp);
1547 os_log(OS_LOG_DEFAULT,
1548 "kern_nexus_interface_add_netagent(%s) returned %d",
1549 ifp->if_xname, error);
1550 return error == 0;
1551 }
1552
1553 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1554 ifnet_remove_netagent(ifnet_t ifp)
1555 {
1556 int error;
1557
1558 error = kern_nexus_interface_remove_netagent(ifp);
1559 os_log(OS_LOG_DEFAULT,
1560 "kern_nexus_interface_remove_netagent(%s) returned %d",
1561 ifp->if_xname, error);
1562 return error == 0;
1563 }
1564
1565 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1566 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1567 {
1568 if (!IF_FULLY_ATTACHED(ifp)) {
1569 return FALSE;
1570 }
1571 return dlil_attach_flowswitch_nexus(ifp);
1572 }
1573
1574 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1575 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1576 {
1577 if_nexus_flowswitch nexus_fsw;
1578
1579 ifnet_lock_exclusive(ifp);
1580 nexus_fsw = ifp->if_nx_flowswitch;
1581 bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1582 ifnet_lock_done(ifp);
1583 return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1584 nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1585 }
1586
1587 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1588 ifnet_attach_netif_nexus(ifnet_t ifp)
1589 {
1590 boolean_t nexus_attached;
1591 if_nexus_netif nexus_netif;
1592
1593 if (!IF_FULLY_ATTACHED(ifp)) {
1594 return FALSE;
1595 }
1596 nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1597 if (nexus_attached) {
1598 ifnet_lock_exclusive(ifp);
1599 ifp->if_nx_netif = nexus_netif;
1600 ifnet_lock_done(ifp);
1601 }
1602 return nexus_attached;
1603 }
1604
1605 boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)1606 ifnet_detach_netif_nexus(ifnet_t ifp)
1607 {
1608 if_nexus_netif nexus_netif;
1609
1610 ifnet_lock_exclusive(ifp);
1611 nexus_netif = ifp->if_nx_netif;
1612 bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
1613 ifnet_lock_done(ifp);
1614
1615 return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
1616 nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
1617 }
1618
1619 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1620 ifnet_attach_native_flowswitch(ifnet_t ifp)
1621 {
1622 if (!dlil_is_native_netif_nexus(ifp)) {
1623 /* not a native netif */
1624 return;
1625 }
1626 ifnet_attach_flowswitch_nexus(ifp);
1627 }
1628
1629 #endif /* SKYWALK */
1630
/*
 * Sanity-check an inbound mbuf: it must carry a pkthdr and its recorded
 * receive interface must match `ifp' (loopback excepted); panics on
 * violation.
 */
#define DLIL_INPUT_CHECK(m, ifp) { \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m); \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) || \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) { \
		panic_plain("%s: invalid mbuf %p\n", __func__, m); \
		/* NOTREACHED */ \
	} \
}

/*
 * Exponentially weighted moving average with a 1/2^decay blend:
 * new_avg = ((old << decay) - old + new) >> decay; seeds directly
 * with `new' while the average is still zero.
 */
#define DLIL_EWMA(old, new, decay) do { \
	u_int32_t _avg; \
	if ((_avg = (old)) > 0) \
		_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else \
		_avg = (new); \
	(old) = _avg; \
} while (0)

/* link-speed unit helpers, in bits per second */
#define MBPS (1ULL * 1000 * 1000)
#define GBPS (MBPS * 1000)
1651
/*
 * Per-downlink-speed watermarks used to tune interface RX polling.
 */
struct rxpoll_time_tbl {
	u_int64_t speed; /* downlink speed */
	u_int32_t plowat; /* packets low watermark */
	u_int32_t phiwat; /* packets high watermark */
	u_int32_t blowat; /* bytes low watermark */
	u_int32_t bhiwat; /* bytes high watermark */
};

/* table is terminated by the zero-speed sentinel entry */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};

/* protects dlil_pending_thread_cnt (DLIL thread startup tracking) */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;
1672
1673 static void
dlil_incr_pending_thread_count(void)1674 dlil_incr_pending_thread_count(void)
1675 {
1676 LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1677 lck_mtx_lock(&dlil_thread_sync_lock);
1678 dlil_pending_thread_cnt++;
1679 lck_mtx_unlock(&dlil_thread_sync_lock);
1680 }
1681
1682 static void
dlil_decr_pending_thread_count(void)1683 dlil_decr_pending_thread_count(void)
1684 {
1685 LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1686 lck_mtx_lock(&dlil_thread_sync_lock);
1687 VERIFY(dlil_pending_thread_cnt > 0);
1688 dlil_pending_thread_cnt--;
1689 if (dlil_pending_thread_cnt == 0) {
1690 wakeup(&dlil_pending_thread_cnt);
1691 }
1692 lck_mtx_unlock(&dlil_thread_sync_lock);
1693 }
1694
1695 int
proto_hash_value(u_int32_t protocol_family)1696 proto_hash_value(u_int32_t protocol_family)
1697 {
1698 /*
1699 * dlil_proto_unplumb_all() depends on the mapping between
1700 * the hash bucket index and the protocol family defined
1701 * here; future changes must be applied there as well.
1702 */
1703 switch (protocol_family) {
1704 case PF_INET:
1705 return 0;
1706 case PF_INET6:
1707 return 1;
1708 case PF_VLAN:
1709 return 2;
1710 case PF_UNSPEC:
1711 default:
1712 return 3;
1713 }
1714 }
1715
1716 /*
1717 * Caller must already be holding ifnet lock.
1718 */
1719 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1720 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1721 {
1722 struct if_proto *proto = NULL;
1723 u_int32_t i = proto_hash_value(protocol_family);
1724
1725 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1726
1727 if (ifp->if_proto_hash != NULL) {
1728 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1729 }
1730
1731 while (proto != NULL && proto->protocol_family != protocol_family) {
1732 proto = SLIST_NEXT(proto, next_hash);
1733 }
1734
1735 if (proto != NULL) {
1736 if_proto_ref(proto);
1737 }
1738
1739 return proto;
1740 }
1741
/*
 * Take an additional (atomic) reference on `proto'.
 */
static void
if_proto_ref(struct if_proto *proto)
{
	atomic_add_32(&proto->refcount, 1);
}
1747
1748 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1749
/*
 * Drop a reference on `proto'.  On the final release: invoke the
 * protocol's detached callback, purge routes for the interface/protocol
 * pair, post KEV_DL_PROTO_DETACHED, mark the interface down when no
 * protocols remain, and free the structure.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	/* drop one reference; bail unless this was the last one */
	oldval = atomic_add_32_ov(&proto->refcount, -1);
	if (oldval > 1) {
		return;
	}

	/* notify the protocol of final detach (KPI-version dependent) */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1811
1812 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1813 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1814 {
1815 #if !MACH_ASSERT
1816 #pragma unused(ifp)
1817 #endif
1818 unsigned int type = 0;
1819 int ass = 1;
1820
1821 switch (what) {
1822 case IFNET_LCK_ASSERT_EXCLUSIVE:
1823 type = LCK_RW_ASSERT_EXCLUSIVE;
1824 break;
1825
1826 case IFNET_LCK_ASSERT_SHARED:
1827 type = LCK_RW_ASSERT_SHARED;
1828 break;
1829
1830 case IFNET_LCK_ASSERT_OWNED:
1831 type = LCK_RW_ASSERT_HELD;
1832 break;
1833
1834 case IFNET_LCK_ASSERT_NOTOWNED:
1835 /* nothing to do here for RW lock; bypass assert */
1836 ass = 0;
1837 break;
1838
1839 default:
1840 panic("bad ifnet assert type: %d", what);
1841 /* NOTREACHED */
1842 }
1843 if (ass) {
1844 LCK_RW_ASSERT(&ifp->if_lock, type);
1845 }
1846 }
1847
/* Acquire ifp->if_lock for shared (read) access. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}
1853
/* Acquire ifp->if_lock for exclusive (write) access. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}
1859
/* Release ifp->if_lock (either shared or exclusive hold). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1865
1866 #if INET
/* Acquire the per-interface IPv4 data lock for shared access. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}
1872
/* Acquire the per-interface IPv4 data lock for exclusive access. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}
1878
/* Release the per-interface IPv4 data lock. */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
1884 #endif
1885
/* Acquire the per-interface IPv6 data lock for shared access. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}
1891
/* Acquire the per-interface IPv6 data lock for exclusive access. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}
1897
/* Release the per-interface IPv6 data lock. */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1903
/* Acquire the global interface-list lock for shared access. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}
1909
/* Acquire the global interface-list lock for exclusive access. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}
1915
/* Release the global interface-list lock. */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}
1921
/* Assert the global interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1927
1928 /*
1929 * dlil_ifp_protolist
1930 * - get the list of protocols attached to the interface, or just the number
1931 * of attached protocols
1932 * - if the number returned is greater than 'list_count', truncation occurred
1933 *
1934 * Note:
1935 * - caller must already be holding ifnet lock.
1936 */
1937 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1938 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1939 u_int32_t list_count)
1940 {
1941 u_int32_t count = 0;
1942 int i;
1943
1944 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1945
1946 if (ifp->if_proto_hash == NULL) {
1947 goto done;
1948 }
1949
1950 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1951 struct if_proto *proto;
1952 SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1953 if (list != NULL && count < list_count) {
1954 list[count] = proto->protocol_family;
1955 }
1956 count++;
1957 }
1958 }
1959 done:
1960 return count;
1961 }
1962
1963 __private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp,u_int32_t * protolist,u_int32_t count)1964 if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
1965 {
1966 ifnet_lock_shared(ifp);
1967 count = dlil_ifp_protolist(ifp, protolist, count);
1968 ifnet_lock_done(ifp);
1969 return count;
1970 }
1971
/*
 * Release a protocol-family list buffer; the buffer must have been
 * obtained from a kalloc_data-style allocation (freed here with
 * kfree_data_addr, which derives the size from the address).
 */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1977
/*
 * Build and post a KEV_NETWORK_CLASS kernel event for @ifp.
 *
 * event_data / event_data_len: optional payload.  Every payload must
 *   begin with a struct net_event_data; when the caller passes NULL, a
 *   zeroed net_event_data on the stack is substituted.  The interface
 *   identity fields (name/family/unit) are filled in here either way.
 * suppress_generation: when TRUE, skip the interface generation-count
 *   update (which would otherwise trigger expensive NECP client
 *   updates) even for events that normally perform it.
 *
 * Returns the result of dlil_event_internal().
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = event_subclass;
	ev_msg.event_code = event_code;

	if (event_data == NULL) {
		/* no caller payload: point at the local, zeroed header */
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	/* stamp the interface identity into the payload header */
	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
2039
/*
 * Allocate the per-interface TCP/UDP/ECN statistics blocks.
 *
 * The tcpstat/udpstat blocks come from zones sized with extra slack so
 * the stats structure can be placed 64-bit aligned within the zone
 * element; the original (unaligned) zone pointer is stashed in the
 * word immediately preceding the aligned base so it can be recovered
 * at free time.
 *
 * Returns 0 on success, EINVAL otherwise.
 *
 * NOTE(review): if called when if_tcp_stat/if_udp_stat are already
 * set, ret stays EINVAL and the cleanup below frees the existing
 * blocks — callers are expected to invoke this only once per ifp;
 * confirm against call sites.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* on failure, undo any allocations recorded on the ifp */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original zone pointer stashed below base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2125
/*
 * Reset all opportunistic-polling (rxpoll) state on @ifp: clear the
 * poll cycle, mode, counters, statistics and hold/sample timers,
 * returning the interface to IFNET_MODEL_INPUT_POLL_OFF.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2144
/*
 * Set up the DLIL input threading state @inp for @ifp and (except for
 * the synchronous strategy) start its input thread.
 *
 * ifp == NULL selects the main input thread (dlil_init time); a
 * legacy interface with IFEF_RXPOLL gets the hybrid polling thread;
 * otherwise either the asynchronous per-interface thread or, for
 * netif-backed devices, the synchronous strategy with no dedicated
 * thread (returns ENODEV after initializing state; see func == NULL).
 *
 * thfunc, if non-NULL, receives the selected thread continuation
 * (NULL for the synchronous strategy).
 *
 * Returns 0 on success, ENODEV for the synchronous strategy; panics
 * if the thread cannot be created.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	/* per-thread lock, named after the thread for diagnosability */
	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */
	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/* effectively unlimited receive queue */
		limit = (u_int32_t)-1;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		/* main thread also owns the loopback receive queue */
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no thread to start */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2279
2280 #if TEST_INPUT_THREAD_TERMINATION
2281 static int
2282 sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
2283 {
2284 #pragma unused(arg1, arg2)
2285 uint32_t i;
2286 int err;
2287
2288 i = if_input_thread_termination_spin;
2289
2290 err = sysctl_handle_int(oidp, &i, 0, req);
2291 if (err != 0 || req->newptr == USER_ADDR_NULL) {
2292 return err;
2293 }
2294
2295 if (net_rxpoll == 0) {
2296 return ENXIO;
2297 }
2298
2299 if_input_thread_termination_spin = i;
2300 return err;
2301 }
2302 #endif /* TEST_INPUT_THREAD_TERMINATION */
2303
/*
 * Tear down and reset a dlil_threading_info so it can be reused:
 * destroy the per-thread mutex and lock group, clear counters/state,
 * and VERIFY the invariants that must hold once the input thread and
 * its queue have fully drained.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	/* destroy the mutex before releasing the group it belongs to */
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	/* queue must already be empty at this point */
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2329
/*
 * Self-termination path for a per-interface input thread: must run on
 * the thread being terminated (never the main input thread).  Drains
 * the pending packet queue, signals TERMINATE_COMPLETE to the waiter,
 * drops the kernel_thread_start() reference and terminates itself.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	/* detach pending packets and notify whoever requested termination */
	lck_mtx_lock_spin(&inp->dlth_lock);
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2377
2378 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2379 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2380 {
2381 thread_affinity_policy_data_t policy;
2382
2383 bzero(&policy, sizeof(policy));
2384 policy.affinity_tag = tag;
2385 return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2386 (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2387 }
2388
2389 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
2390 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2391 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2392 enum net_filter_event_subsystems state)
2393 {
2394 if (state == 0) {
2395 if_enable_fsw_transport_netagent = 1;
2396 } else {
2397 if_enable_fsw_transport_netagent = 0;
2398 }
2399 kern_nexus_update_netagents();
2400 }
2401 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2402
/*
 * One-time initialization of the Data Link Interface Layer: verify
 * compile-time layout/flag invariants, parse boot-args, create the
 * zones backing ifnet and per-interface stat allocations, initialize
 * dependent networking subsystems, then start the main DLIL input
 * thread and the ifnet detacher thread, blocking until both have been
 * scheduled at least once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_DEFAULT == IFNET_SUBFAMILY_DEFAULT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* tunable overrides from boot-args */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled. This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
#ifdef XNU_TARGET_OS_OSX
	if (if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}
#endif /* XNU_TARGET_OS_OSX */

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	/* debug builds use the larger dlil_ifnet_dbg element */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2715
2716 static void
if_flt_monitor_busy(struct ifnet * ifp)2717 if_flt_monitor_busy(struct ifnet *ifp)
2718 {
2719 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2720
2721 ++ifp->if_flt_busy;
2722 VERIFY(ifp->if_flt_busy != 0);
2723 }
2724
/*
 * Drop a busy hold taken via if_flt_monitor_busy(); kept as a separate
 * wrapper so busy/unbusy pairs read symmetrically at call sites.
 * Caller holds if_flt_lock (asserted inside if_flt_monitor_leave()).
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2730
2731 static void
if_flt_monitor_enter(struct ifnet * ifp)2732 if_flt_monitor_enter(struct ifnet *ifp)
2733 {
2734 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2735
2736 while (ifp->if_flt_busy) {
2737 ++ifp->if_flt_waiters;
2738 (void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2739 (PZERO - 1), "if_flt_monitor", NULL);
2740 }
2741 if_flt_monitor_busy(ifp);
2742 }
2743
2744 static void
if_flt_monitor_leave(struct ifnet * ifp)2745 if_flt_monitor_leave(struct ifnet *ifp)
2746 {
2747 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2748
2749 VERIFY(ifp->if_flt_busy != 0);
2750 --ifp->if_flt_busy;
2751
2752 if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2753 ifp->if_flt_waiters = 0;
2754 wakeup(&ifp->if_flt_head);
2755 }
2756 }
2757
/*
 * Attach an interface filter to ifp and return the new filter record
 * through *filter_ref.
 *
 * Returns 0 on success, or ENXIO if the interface is not in the global
 * interface list or is no longer attached.  The callbacks from
 * if_filter are copied into the new record; iff_cookie and iff_name
 * are stored by reference and must outlive the filter.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* takes an IO refcnt (arg 1) so ifp can't finish detaching under us */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	/* Z_NOFAIL: allocation cannot return NULL, so no check is needed */
	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 */
	if (!IFNET_IS_INTCOPROC(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	/* the detached callback is installed unconditionally */
	filter->filt_detached = if_filter->iff_detached;

	lck_mtx_lock(&ifp->if_flt_lock);
	/* serialize against concurrent attach/detach monitors */
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	/* global stats: live count plus monotonically increasing total */
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* per-interface count of non-OS (third-party) filters */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the IO refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2847
/*
 * Detach an interface filter.
 *
 * detached == 0: explicit detach via dlil_detach_filter(); walk the
 * global interface list to locate the filter, unlink it, fix up the
 * stats, then destroy it.  Returns EINVAL if the filter reference is
 * not found on any interface.
 *
 * detached != 0: implicit detach from ifnet_detach_final(); the caller
 * has already emptied if_flt_head, so only the counters are adjusted
 * before the filter is destroyed.
 */
static int
dlil_detach_filter_internal(interface_filter_t filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				/* filt_skip set means detach already in progress */
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1; /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				/* mirror the accounting done at attach time */
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevaluate doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevaluate doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	/* global stats: reverse the increments done in dlil_attach_filter() */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* NB: filter is always NULL here when retval != 0 came from destroy path */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2968
2969 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2970 dlil_detach_filter(interface_filter_t filter)
2971 {
2972 if (filter == NULL) {
2973 return;
2974 }
2975 dlil_detach_filter_internal(filter, 0);
2976 }
2977
2978 __private_extern__ boolean_t
dlil_has_ip_filter(void)2979 dlil_has_ip_filter(void)
2980 {
2981 boolean_t has_filter = (net_api_stats.nas_ipf_add_count > 0);
2982 DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2983 return has_filter;
2984 }
2985
2986 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2987 dlil_has_if_filter(struct ifnet *ifp)
2988 {
2989 boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2990 DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2991 return has_filter;
2992 }
2993
2994 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)2995 dlil_input_wakeup(struct dlil_threading_info *inp)
2996 {
2997 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
2998
2999 inp->dlth_flags |= DLIL_INPUT_WAITING;
3000 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
3001 inp->dlth_wtot++;
3002 wakeup_one((caddr_t)&inp->dlth_flags);
3003 }
3004 }
3005
/*
 * Bootstrap entry for the main DLIL input thread.  Runs exactly once:
 * it marks the thread embryonic, issues a self-wakeup so the
 * continuation runs right away (which clears the embryonic state and
 * lets dlil_init()'s startup barrier proceed), then blocks with
 * dlil_main_input_thread_cont as the continuation.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait before the self-wakeup below so the event isn't lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3028
3029 /*
3030 * Main input thread:
3031 *
3032 * a) handles all inbound packets for lo0
3033 * b) handles all inbound packets for interfaces with no dedicated
3034 * input thread (e.g. anything but Ethernet/PDP or those that support
3035 * opportunistic polling.)
3036 * c) protocol registrations
3037 * d) packet injections
3038 */
3039 __attribute__((noreturn))
3040 static void
dlil_main_input_thread_cont(void * v,wait_result_t wres)3041 dlil_main_input_thread_cont(void *v, wait_result_t wres)
3042 {
3043 struct dlil_main_threading_info *inpm = v;
3044 struct dlil_threading_info *inp = v;
3045
3046 /* main input thread is uninterruptible */
3047 VERIFY(wres != THREAD_INTERRUPTED);
3048 lck_mtx_lock_spin(&inp->dlth_lock);
3049 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
3050 DLIL_INPUT_RUNNING)));
3051 inp->dlth_flags |= DLIL_INPUT_RUNNING;
3052
3053 while (1) {
3054 struct mbuf *m = NULL, *m_loop = NULL;
3055 u_int32_t m_cnt, m_cnt_loop;
3056 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3057 boolean_t proto_req;
3058 boolean_t embryonic;
3059
3060 inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3061
3062 if (__improbable(embryonic =
3063 (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3064 inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3065 }
3066
3067 proto_req = (inp->dlth_flags &
3068 (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
3069
3070 /* Packets for non-dedicated interfaces other than lo0 */
3071 m_cnt = qlen(&inp->dlth_pkts);
3072 _getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3073 m = pkt.cp_mbuf;
3074
3075 /* Packets exclusive to lo0 */
3076 m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
3077 _getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
3078 m_loop = pkt.cp_mbuf;
3079
3080 inp->dlth_wtot = 0;
3081
3082 lck_mtx_unlock(&inp->dlth_lock);
3083
3084 if (__improbable(embryonic)) {
3085 dlil_decr_pending_thread_count();
3086 }
3087
3088 /*
3089 * NOTE warning %%% attention !!!!
3090 * We should think about putting some thread starvation
3091 * safeguards if we deal with long chains of packets.
3092 */
3093 if (__probable(m_loop != NULL)) {
3094 dlil_input_packet_list_extended(lo_ifp, m_loop,
3095 m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
3096 }
3097
3098 if (__probable(m != NULL)) {
3099 dlil_input_packet_list_extended(NULL, m,
3100 m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
3101 }
3102
3103 if (__improbable(proto_req)) {
3104 proto_input_run();
3105 }
3106
3107 lck_mtx_lock_spin(&inp->dlth_lock);
3108 VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3109 /* main input thread cannot be terminated */
3110 VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
3111 if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
3112 break;
3113 }
3114 }
3115
3116 inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3117 (void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3118 lck_mtx_unlock(&inp->dlth_lock);
3119 (void) thread_block_parameter(dlil_main_input_thread_cont, inp);
3120
3121 VERIFY(0); /* we should never get here */
3122 /* NOTREACHED */
3123 __builtin_unreachable();
3124 }
3125
3126 /*
3127 * Input thread for interfaces with legacy input model.
3128 */
3129 __attribute__((noreturn))
3130 static void
dlil_input_thread_func(void * v,wait_result_t w)3131 dlil_input_thread_func(void *v, wait_result_t w)
3132 {
3133 #pragma unused(w)
3134 char thread_name[MAXTHREADNAMESIZE];
3135 struct dlil_threading_info *inp = v;
3136 struct ifnet *ifp = inp->dlth_ifp;
3137
3138 VERIFY(inp != dlil_main_input_thread);
3139 VERIFY(ifp != NULL);
3140 VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
3141 !(ifp->if_xflags & IFXF_LEGACY));
3142 VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
3143 !(ifp->if_xflags & IFXF_LEGACY));
3144 VERIFY(current_thread() == inp->dlth_thread);
3145
3146 /* construct the name for this thread, and then apply it */
3147 bzero(thread_name, sizeof(thread_name));
3148 (void) snprintf(thread_name, sizeof(thread_name),
3149 "dlil_input_%s", ifp->if_xname);
3150 thread_set_thread_name(inp->dlth_thread, thread_name);
3151
3152 lck_mtx_lock(&inp->dlth_lock);
3153 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3154 (void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3155 inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3156 /* wake up once to get out of embryonic state */
3157 dlil_input_wakeup(inp);
3158 lck_mtx_unlock(&inp->dlth_lock);
3159 (void) thread_block_parameter(dlil_input_thread_cont, inp);
3160 /* NOTREACHED */
3161 __builtin_unreachable();
3162 }
3163
/*
 * Continuation body for a per-interface legacy input thread: drains
 * the interface's receive queue, feeds the chain to the DLIL input
 * path, and blocks again.  Honors DLIL_INPUT_TERMINATE (and an
 * interrupted wait) by handing the thread to
 * dlil_terminate_input_thread(), which never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation clears the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while processing the dequeued chain */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblock the interface's thread-startup barrier */
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if new work (or terminate) was flagged */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait before unlocking to avoid a lost wakeup */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3267
3268 /*
3269 * Input thread for interfaces with opportunistic polling input model.
3270 */
3271 __attribute__((noreturn))
3272 static void
dlil_rxpoll_input_thread_func(void * v,wait_result_t w)3273 dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
3274 {
3275 #pragma unused(w)
3276 char thread_name[MAXTHREADNAMESIZE];
3277 struct dlil_threading_info *inp = v;
3278 struct ifnet *ifp = inp->dlth_ifp;
3279
3280 VERIFY(inp != dlil_main_input_thread);
3281 VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
3282 (ifp->if_xflags & IFXF_LEGACY));
3283 VERIFY(current_thread() == inp->dlth_thread);
3284
3285 /* construct the name for this thread, and then apply it */
3286 bzero(thread_name, sizeof(thread_name));
3287 (void) snprintf(thread_name, sizeof(thread_name),
3288 "dlil_input_poll_%s", ifp->if_xname);
3289 thread_set_thread_name(inp->dlth_thread, thread_name);
3290
3291 lck_mtx_lock(&inp->dlth_lock);
3292 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3293 (void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
3294 inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3295 /* wake up once to get out of embryonic state */
3296 dlil_input_wakeup(inp);
3297 lck_mtx_unlock(&inp->dlth_lock);
3298 (void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
3299 /* NOTREACHED */
3300 __builtin_unreachable();
3301 }
3302
/*
 * Continuation body for an rxpoll-capable input thread.  In addition
 * to draining and processing the receive queue, it samples inbound
 * packet/byte/wakeup rates over if_poll_sample_holdtime windows,
 * maintains EWMAs of those rates, and uses the low/high watermarks to
 * transition the interface between IFNET_MODEL_INPUT_POLL_OFF and
 * IFNET_MODEL_INPUT_POLL_ON, notifying the driver of mode changes via
 * its if_input_ctl callback.  Honors DLIL_INPUT_TERMINATE like the
 * legacy continuation.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/*
		 * First pass after thread creation: clear the embryonic
		 * state and skip straight to stats sync; no packets yet.
		 */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the poll interval to the supported minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* sampling window not yet elapsed; keep accumulating */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				/* rate-limit debug output to dlil_dbgrate */
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* hold the current mode for at least mode_holdtime */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/*
			 * Below both low watermarks: polling off.  Above the
			 * packet high watermark and either the byte or wakeup
			 * high watermark: polling on.
			 */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while processing and talking to the driver */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblock the interface's thread-startup barrier */
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode. Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* driver downcall; failure is logged and counted below */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if new work (or terminate) was flagged */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* re-arm the wait before unlocking to avoid a lost wakeup */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3588
3589 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3590 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3591 {
3592 if (p != NULL) {
3593 if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3594 (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3595 return EINVAL;
3596 }
3597 if (p->packets_lowat != 0 && /* hiwat must be non-zero */
3598 p->packets_lowat >= p->packets_hiwat) {
3599 return EINVAL;
3600 }
3601 if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3602 (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3603 return EINVAL;
3604 }
3605 if (p->bytes_lowat != 0 && /* hiwat must be non-zero */
3606 p->bytes_lowat >= p->bytes_hiwat) {
3607 return EINVAL;
3608 }
3609 if (p->interval_time != 0 &&
3610 p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3611 p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3612 }
3613 }
3614 return 0;
3615 }
3616
/*
 * Recompute ifp's rxpoll watermarks, packet limit and interval.
 *
 * When the link rate is unknown (zero) and no explicit parameters are
 * given, polling is effectively disabled: watermarks are opened wide
 * and the sample holdtime is zeroed.  Otherwise the rxpoll_tbl entry
 * matching the link rate supplies auto-tuned defaults, which explicit
 * caller parameters override per field.  Caller is expected to hold
 * the input thread lock (see dlil_rxpoll_set_params).
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		/* high watermarks wide open so POLL_ON is never chosen */
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* pick the highest table entry whose speed <= inbw */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/*
		 * NOTE(review): a non-zero if_rxpoll_max sysctl appears to
		 * take precedence over the caller's packets_limit; likewise
		 * a non-default if_rxpoll_interval_time overrides the
		 * caller's interval_time — confirm this override is intended.
		 */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ? if_rxpoll_max : p->packets_limit);
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert nanosecond holdtimes into timespec form for comparisons */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3686
3687 /*
3688 * Must be called on an attached ifnet (caller is expected to check.)
3689 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3690 */
3691 errno_t
dlil_rxpoll_set_params(struct ifnet * ifp,struct ifnet_poll_params * p,boolean_t locked)3692 dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
3693 boolean_t locked)
3694 {
3695 errno_t err;
3696 struct dlil_threading_info *inp;
3697
3698 VERIFY(ifp != NULL);
3699 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3700 return ENXIO;
3701 }
3702 err = dlil_rxpoll_validate_params(p);
3703 if (err != 0) {
3704 return err;
3705 }
3706
3707 if (!locked) {
3708 lck_mtx_lock(&inp->dlth_lock);
3709 }
3710 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3711 /*
3712 * Normally, we'd reset the parameters to the auto-tuned values
3713 * if the the input thread detects a change in link rate. If the
3714 * driver provides its own parameters right after a link rate
3715 * changes, but before the input thread gets to run, we want to
3716 * make sure to keep the driver's values. Clearing if_poll_update
3717 * will achieve that.
3718 */
3719 if (p != NULL && !locked && ifp->if_poll_update != 0) {
3720 ifp->if_poll_update = 0;
3721 }
3722 dlil_rxpoll_update_params(ifp, p);
3723 if (!locked) {
3724 lck_mtx_unlock(&inp->dlth_lock);
3725 }
3726 return 0;
3727 }
3728
3729 /*
3730 * Must be called on an attached ifnet (caller is expected to check.)
3731 */
3732 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3733 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3734 {
3735 struct dlil_threading_info *inp;
3736
3737 VERIFY(ifp != NULL && p != NULL);
3738 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3739 return ENXIO;
3740 }
3741
3742 bzero(p, sizeof(*p));
3743
3744 lck_mtx_lock(&inp->dlth_lock);
3745 p->packets_limit = ifp->if_rxpoll_plim;
3746 p->packets_lowat = ifp->if_rxpoll_plowat;
3747 p->packets_hiwat = ifp->if_rxpoll_phiwat;
3748 p->bytes_lowat = ifp->if_rxpoll_blowat;
3749 p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3750 p->interval_time = ifp->if_rxpoll_ival;
3751 lck_mtx_unlock(&inp->dlth_lock);
3752
3753 return 0;
3754 }
3755
/*
 * Basic input entry point: enqueue a chain of packets headed by m_head.
 * Non-extended (no tail pointer), non-polling variant; s may carry
 * driver stat increments or be NULL.
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3762
/*
 * Extended input entry point: caller supplies both head and tail of the
 * packet chain plus mandatory stat increments (s must not be NULL).
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3769
/*
 * Polling input entry point; an empty poll (m_head == NULL) is legal
 * and is treated as the non-extended case.
 */
errno_t
ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
	return ifnet_input_common(ifp, m_head, m_tail, s,
	    (m_head != NULL), TRUE);
}
3777
/*
 * Common ingress path shared by ifnet_input(), ifnet_input_extended()
 * and ifnet_input_poll().  Validates the chain, counts/verifies packets
 * and bytes, takes an IO refcnt on the interface, and hands the chain
 * to the interface's DLIL input function.
 *
 * @param m_head  head of packet chain; may be NULL only when poll is TRUE.
 * @param m_tail  tail of chain; non-NULL only for the extended variant.
 * @param s       driver stat increments; required iff ext is TRUE.
 * @param ext     extended variant (tail and stats supplied by caller).
 * @param poll    chain was obtained via the RX poll path.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* reject: empty chain outside polling, or extended without stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail given: walk the chain to find it and count as we go */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* sanity mode: recount the chain instead of trusting s */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	/*
	 * NOTE(review): `s` (not `&_s`) is forwarded here, so the counts
	 * recomputed into _s above only reach the input function when the
	 * caller passed no stats (s was redirected to &_s); when the driver
	 * supplied its own stats they are forwarded as-is and the _s update
	 * appears unused -- presumably intentional, verify against callers.
	 */
	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3892
3893 #if SKYWALK
/*
 * Atomically install fn as the interface's DLIL input function, but only
 * if the current handler is still the default dlil_input_handler (a
 * compare-and-swap via atomic_test_set_ptr).  Returns EBUSY when another
 * handler is already installed.  ptrauth_nop_cast strips/neutralizes
 * pointer-authentication so the raw pointer values can be compared.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return atomic_test_set_ptr(&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, &dlil_input_handler),
	    ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3901
/*
 * Restore the default DLIL input handler, spinning until the swap from
 * whatever handler is currently installed succeeds (the re-read of
 * ifp->if_input_dlil each iteration handles concurrent changes).
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler))) {
		;
	}
}
3911
/*
 * Atomically install fn as the DLIL output function iff the default
 * dlil_output_handler is still in place; EBUSY otherwise.
 * Counterpart of dlil_set_input_handler().
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return atomic_test_set_ptr(&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, &dlil_output_handler),
	    ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3919
/*
 * Restore the default DLIL output handler; spins until the
 * compare-and-swap against the currently-installed handler succeeds.
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler))) {
		;
	}
}
3929 #endif /* SKYWALK */
3930
/*
 * Default DLIL output handler: forward the packet straight to the
 * driver's if_output routine.
 */
errno_t
dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
{
	return ifp->if_output(ifp, m);
}
3936
/*
 * Default DLIL input handler: dispatch the packet chain to the
 * interface's input thread strategy (async/sync), falling back to the
 * main input thread when the interface has none of its own.  On
 * DEVELOPMENT/DEBUG kernels, a thread marked NET_THREAD_SYNC_RX forces
 * the synchronous path.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
3957
/*
 * Asynchronous input strategy: enqueue the chain on the input thread's
 * receive queue (or the dedicated lo0 queue on the main thread), update
 * stats, and wake the input thread to do the actual processing.
 *
 * @param inp  input thread the chain is queued on.
 * @param tp   the calling (driver/poller) thread, used for affinity;
 *             may be NULL.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set. We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		/* record tp as the (one-time) driver or poller thread */
		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* drop the lock: dlil_affinity_set() may block */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack. Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			_addq_multi(&inpm->lo_rcvq_pkts, &head, &tail,
			    m_cnt, m_size);
		} else {
			_addq_multi(&inp->dlth_pkts, &head, &tail,
			    m_cnt, m_size);
		}
	}

#if IFNET_INPUT_SANITY_CHK
	if (__improbable(dlil_input_sanity_check != 0)) {
		/* recount the chain and panic on any mismatch with s */
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context. All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/* notify outside the lock */
	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4070
/*
 * Synchronous input strategy: enqueue the chain, then immediately drain
 * the input thread's queue and process the packets in the calling
 * thread's context via dlil_input_packet_list_extended() -- no wakeup
 * of a separate input thread.  Never used for the main input thread.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

#if IFNET_INPUT_SANITY_CHK
	if (__improbable(dlil_input_sanity_check != 0)) {
		/* recount the chain and panic on any mismatch with s */
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);

	/* drain everything queued (possibly more than we just added) */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4155
4156 #if SKYWALK
4157 errno_t
ifnet_set_output_handler(struct ifnet * ifp,ifnet_output_func fn)4158 ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
4159 {
4160 return atomic_test_set_ptr(&ifp->if_output,
4161 ptrauth_nop_cast(void *, ifp->if_save_output),
4162 ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
4163 }
4164
/*
 * Restore the saved if_output routine, spinning until the swap from the
 * currently-installed handler succeeds.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output))) {
		;
	}
}
4174
/*
 * Atomically replace the driver's if_start with fn, but only while the
 * saved original (if_save_start) is still installed; EBUSY otherwise.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return atomic_test_set_ptr(&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_save_start),
	    ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4182
/*
 * Restore the saved if_start routine, spinning until the swap from the
 * currently-installed handler succeeds.
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start))) {
		;
	}
}
4192 #endif /* SKYWALK */
4193
/*
 * Request a run of the interface's starter thread.
 *
 * @param resetfc  TRUE to clear the flow-controlled state before
 *                 signalling (used when flow control is being lifted);
 *                 FALSE requests are dropped while flow-controlled.
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/*
	 * Wake the starter only if it's idle and either delayed-start
	 * does not apply (resetfc / no IFEF_ENQUEUE_MULTI) or enough
	 * packets have accumulated to warrant an immediate start.
	 */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4223
/*
 * Public kick for the transmit starter thread; does not touch the
 * flow-controlled state.
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE);
}
4229
/*
 * Entry point of the per-interface transmit starter thread.  Names the
 * thread, optionally binds the lo0 starter to the main input thread's
 * affinity set, then parks in the embryonic state and continues in
 * ifnet_start_thread_cont() on first wakeup.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag. This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	/* block; all further work happens in the continuation */
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4295
/*
 * Continuation of the transmit starter thread.  Services start requests
 * by invoking the driver's if_start routine until no new request is
 * pending or the interface is flow-controlled/terminating, then blocks
 * again on this same continuation (possibly with a TBR or delayed-start
 * deadline).  Terminates the thread when IFSF_TERMINATING is set or the
 * wait was interrupted.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	if (__improbable(ifp->if_start_embryonic)) {
		/* first wakeup: leave embryonic state, then park again */
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/*
		 * Delayed start: with IFEF_ENQUEUE_MULTI/IFEF_DELAY_START,
		 * hold off calling the driver until enough packets queue up.
		 */
		if (!IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
		    &ifp->if_start_cycle : NULL);

		/* a delayed start also gets a finite wakeup deadline */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4443
4444 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4445 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4446 {
4447 if (ts == NULL) {
4448 bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4449 } else {
4450 *(&ifp->if_start_cycle) = *ts;
4451 }
4452
4453 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4454 DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4455 if_name(ifp), ts->tv_nsec);
4456 }
4457 }
4458
/*
 * Record a poll request and wake the poller thread if it is idle.
 * Caller must hold if_poll_lock.
 */
static inline void
ifnet_poll_wakeup(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);

	ifp->if_poll_req++;
	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
	    ifp->if_poll_thread != THREAD_NULL) {
		wakeup_one((caddr_t)&ifp->if_poll_thread);
	}
}
4470
/*
 * Public kick for the RX poller thread.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4481
/*
 * Entry point of the per-interface RX poller thread.  Names the thread,
 * parks it in the embryonic state, and continues in
 * ifnet_poll_thread_cont() on first wakeup.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	/* block; all further work happens in the continuation */
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4510
/*
 * Continuation of the RX poller thread.  Services poll requests by
 * calling the driver's if_input_poll routine for up to m_lim packets
 * per iteration and feeding the results into ifnet_input_common(),
 * until no new request is pending or the thread is terminating; then
 * blocks again on this continuation (with a deadline when a poll cycle
 * interval is configured).  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		/* first wakeup: leave embryonic state, then park again */
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		u_int16_t req = ifp->if_poll_req;

		/* per-poll packet budget: explicit limit or derived fallback */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		    &m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll still goes through the common path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4677
4678 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4679 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4680 {
4681 if (ts == NULL) {
4682 bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4683 } else {
4684 *(&ifp->if_poll_cycle) = *ts;
4685 }
4686
4687 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4688 DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4689 if_name(ifp), ts->tv_nsec);
4690 }
4691 }
4692
/*
 * Flush the interface's send queue; no-op for NULL or non-TXSTART
 * interfaces.
 */
void
ifnet_purge(struct ifnet *ifp)
{
	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
		if_qflush_snd(ifp, false);
	}
}
4700
/*
 * Propagate a classq event to the send queue: re-apply the token bucket
 * regulator profile (so a percentage-based rate tracks the new link
 * speed) and notify the packet scheduler.  Caller holds the IFCQ lock.
 */
void
ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);

	if (!(IFCQ_IS_READY(ifq))) {
		return;
	}

	if (IFCQ_TBR_IS_ENABLED(ifq)) {
		/* re-program TBR with its current raw rate/percent */
		struct tb_profile tb = {
			.rate = ifq->ifcq_tbr.tbr_rate_raw,
			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
		};
		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
	}

	ifclassq_update(ifq, ev);
}
4720
4721 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4722 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4723 {
4724 switch (ev) {
4725 case CLASSQ_EV_LINK_BANDWIDTH:
4726 if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4727 ifp->if_poll_update++;
4728 }
4729 break;
4730
4731 default:
4732 break;
4733 }
4734 }
4735
4736 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4737 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4738 {
4739 struct ifclassq *ifq;
4740 u_int32_t omodel;
4741 errno_t err;
4742
4743 if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4744 return EINVAL;
4745 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4746 return ENXIO;
4747 }
4748
4749 ifq = ifp->if_snd;
4750 IFCQ_LOCK(ifq);
4751 omodel = ifp->if_output_sched_model;
4752 ifp->if_output_sched_model = model;
4753 if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4754 ifp->if_output_sched_model = omodel;
4755 }
4756 IFCQ_UNLOCK(ifq);
4757
4758 return err;
4759 }
4760
4761 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4762 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4763 {
4764 if (ifp == NULL) {
4765 return EINVAL;
4766 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4767 return ENXIO;
4768 }
4769
4770 ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4771
4772 return 0;
4773 }
4774
4775 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4776 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4777 {
4778 if (ifp == NULL || maxqlen == NULL) {
4779 return EINVAL;
4780 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4781 return ENXIO;
4782 }
4783
4784 *maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4785
4786 return 0;
4787 }
4788
4789 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4790 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4791 {
4792 errno_t err;
4793
4794 if (ifp == NULL || pkts == NULL) {
4795 err = EINVAL;
4796 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4797 err = ENXIO;
4798 } else {
4799 err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4800 IF_CLASSQ_ALL_GRPS, pkts, NULL);
4801 }
4802
4803 return err;
4804 }
4805
4806 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4807 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4808 u_int32_t *pkts, u_int32_t *bytes)
4809 {
4810 errno_t err;
4811
4812 if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4813 (pkts == NULL && bytes == NULL)) {
4814 err = EINVAL;
4815 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4816 err = ENXIO;
4817 } else {
4818 err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4819 pkts, bytes);
4820 }
4821
4822 return err;
4823 }
4824
4825 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4826 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4827 {
4828 struct dlil_threading_info *inp;
4829
4830 if (ifp == NULL) {
4831 return EINVAL;
4832 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4833 return ENXIO;
4834 }
4835
4836 if (maxqlen == 0) {
4837 maxqlen = if_rcvq_maxlen;
4838 } else if (maxqlen < IF_RCVQ_MINLEN) {
4839 maxqlen = IF_RCVQ_MINLEN;
4840 }
4841
4842 inp = ifp->if_inp;
4843 lck_mtx_lock(&inp->dlth_lock);
4844 qlimit(&inp->dlth_pkts) = maxqlen;
4845 lck_mtx_unlock(&inp->dlth_lock);
4846
4847 return 0;
4848 }
4849
4850 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4851 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4852 {
4853 struct dlil_threading_info *inp;
4854
4855 if (ifp == NULL || maxqlen == NULL) {
4856 return EINVAL;
4857 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4858 return ENXIO;
4859 }
4860
4861 inp = ifp->if_inp;
4862 lck_mtx_lock(&inp->dlth_lock);
4863 *maxqlen = qlimit(&inp->dlth_pkts);
4864 lck_mtx_unlock(&inp->dlth_lock);
4865 return 0;
4866 }
4867
4868 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4869 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4870 uint16_t delay_timeout)
4871 {
4872 if (delay_qlen > 0 && delay_timeout > 0) {
4873 if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4874 ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4875 ifp->if_start_delay_timeout = min(20000, delay_timeout);
4876 /* convert timeout to nanoseconds */
4877 ifp->if_start_delay_timeout *= 1000;
4878 kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4879 ifp->if_xname, (uint32_t)delay_qlen,
4880 (uint32_t)delay_timeout);
4881 } else {
4882 if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4883 }
4884 }
4885
4886 /*
4887 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
4888 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
4889 * buf holds the full header.
4890 */
4891 static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t * buf,uint8_t ip_ver)4892 ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
4893 {
4894 struct ip *ip;
4895 struct ip6_hdr *ip6;
4896 uint8_t lbuf[64] __attribute__((aligned(8)));
4897 uint8_t *p = buf;
4898
4899 if (ip_ver == IPVERSION) {
4900 uint8_t old_tos;
4901 uint32_t sum;
4902
4903 if (__improbable(!IP_HDR_ALIGNED_P(p))) {
4904 DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
4905 bcopy(buf, lbuf, sizeof(struct ip));
4906 p = lbuf;
4907 }
4908 ip = (struct ip *)(void *)p;
4909 if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
4910 return;
4911 }
4912
4913 DTRACE_IP1(clear__v4, struct ip *, ip);
4914 old_tos = ip->ip_tos;
4915 ip->ip_tos &= IPTOS_ECN_MASK;
4916 sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
4917 sum = (sum >> 16) + (sum & 0xffff);
4918 ip->ip_sum = (uint16_t)(sum & 0xffff);
4919
4920 if (__improbable(p == lbuf)) {
4921 bcopy(lbuf, buf, sizeof(struct ip));
4922 }
4923 } else {
4924 uint32_t flow;
4925 ASSERT(ip_ver == IPV6_VERSION);
4926
4927 if (__improbable(!IP_HDR_ALIGNED_P(p))) {
4928 DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
4929 bcopy(buf, lbuf, sizeof(struct ip6_hdr));
4930 p = lbuf;
4931 }
4932 ip6 = (struct ip6_hdr *)(void *)p;
4933 flow = ntohl(ip6->ip6_flow);
4934 if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
4935 return;
4936 }
4937
4938 DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
4939 ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);
4940
4941 if (__improbable(p == lbuf)) {
4942 bcopy(lbuf, buf, sizeof(struct ip6_hdr));
4943 }
4944 }
4945 }
4946
/*
 * Enqueue a single packet (mbuf or native Skywalk packet) on the
 * interface's output classq.  Along the way this stamps the packet
 * timestamp if missing, records foreground/realtime activity times,
 * applies the Wi-Fi multicast DSCP-clearing workaround, runs the
 * delayed-start (enqueue coalescing) heuristics, and finally kicks the
 * driver's start callback when appropriate.  The caller relinquishes
 * ownership of the packet; *pdrop reports whether it was dropped.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	/* non-NULL only when the multicast DSCP workaround applies below */
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here. Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* make sure at least the Ethernet header is contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP; skip the workaround entirely */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.  ip_ver is always set
			 * whenever mcast_buf is set.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface. If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* runt frames: skip the workaround, not the enqueue */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still within the sampling window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and disable delay */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: decide based on enqueue rate */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first enqueue: open a new sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5257
5258 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5259 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5260 classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5261 boolean_t flush, boolean_t *pdrop)
5262 {
5263 int error;
5264
5265 /* enqueue the packet (caller consumes object) */
5266 error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5267 cnt, bytes, pdrop);
5268
5269 /*
5270 * Tell the driver to start dequeueing; do this even when the queue
5271 * for the packet is suspended (EQSUSPENDED), as the driver could still
5272 * be dequeueing from other unsuspended queues.
5273 */
5274 if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5275 ifnet_start(ifp);
5276 }
5277 return error;
5278 }
5279
#if DEVELOPMENT || DEBUG
/*
 * Debug-only tracer: emits the first 32 bytes of UDP payload via kdebug
 * for packets matching the configured flow_key_trace tuple.  `input`
 * selects the RX vs TX trace code and swaps the src/dst sense of the
 * configured key.  No-op (fast-exit) unless kdebug is enabled and a
 * trace key has been configured.
 */
void
trace_pkt_dump_payload(struct ifnet *ifp, struct __kern_packet *kpkt, bool input)
{
#define MIN_TRACE_DUMP_PKT_SIZE 32
	struct ether_header *eh = NULL;
	struct udphdr *uh = NULL;

	if (__probable(kdebug_enable == 0 || (flow_key_trace.fk_ipver != IPVERSION &&
	    flow_key_trace.fk_ipver != IPV6_VERSION))) {
		return;
	}

	uint16_t bdlim, bdlen, bdoff;
	uint8_t *baddr;

	MD_BUFLET_ADDR_ABS_DLEN(kpkt, baddr, bdlen, bdlim, bdoff);

	/* classify the flow first if it hasn't been classified already */
	if (!(kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED)) {
		if (!IFNET_IS_ETHERNET(ifp)) {
			return;
		}

		sa_family_t af = AF_UNSPEC;
		ASSERT(kpkt->pkt_l2_len > 0);

		baddr += kpkt->pkt_headroom;
		eh = (struct ether_header *)(void *)baddr;
		/* the Ethernet header must fit in the packet and first buflet */
		if (__improbable(sizeof(*eh) > kpkt->pkt_length)) {
			return;
		}
		if (__improbable(kpkt->pkt_headroom + sizeof(*eh) > bdlim)) {
			return;
		}
		uint16_t ether_type = ntohs(eh->ether_type);
		if (ether_type == ETHERTYPE_IP) {
			af = AF_INET;
		} else if (ether_type == ETHERTYPE_IPV6) {
			af = AF_INET6;
		} else {
			return;
		}
		flow_pkt_classify(kpkt, ifp, af, input);
	}

	/* match the classified 5-tuple against the configured trace key */
	if (kpkt->pkt_flow_ip_ver != flow_key_trace.fk_ipver) {
		return;
	}

	if (kpkt->pkt_flow_ip_proto != IPPROTO_UDP) {
		return;
	}

	uint16_t sport = input ? flow_key_trace.fk_dport : flow_key_trace.fk_sport;
	uint16_t dport = input ? flow_key_trace.fk_sport : flow_key_trace.fk_dport;

	if (kpkt->pkt_flow_udp_src != sport ||
	    kpkt->pkt_flow_udp_dst != dport) {
		return;
	}

	if (kpkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *ip_header = (struct ip *)kpkt->pkt_flow_ip_hdr;
		struct in_addr *saddr = input ? &flow_key_trace.fk_dst4 : &flow_key_trace.fk_src4;
		struct in_addr *daddr = input ? &flow_key_trace.fk_src4 : &flow_key_trace.fk_dst4;

		if (ip_header->ip_src.s_addr != saddr->s_addr ||
		    ip_header->ip_dst.s_addr != daddr->s_addr) {
			return;
		}
	} else if (kpkt->pkt_flow_ip_ver == IPV6_VERSION) {
		struct ip6_hdr *ip6_header = (struct ip6_hdr *)kpkt->pkt_flow_ip_hdr;
		struct in6_addr *saddr = input ? &flow_key_trace.fk_dst6 : &flow_key_trace.fk_src6;
		struct in6_addr *daddr = input ? &flow_key_trace.fk_src6 : &flow_key_trace.fk_dst6;

		if (!IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_src, saddr) ||
		    !IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_dst, daddr)) {
			return;
		}
	}

	int udp_payload_offset = kpkt->pkt_l2_len + kpkt->pkt_flow_ip_hlen + sizeof(struct udphdr);

	/*
	 * NOTE(review): if udp_payload_offset exceeds pkt_payload_len the
	 * uint16_t subtraction below wraps and the size check would pass;
	 * verify that classification guarantees sufficient header+payload.
	 */
	uint16_t pkt_payload_len = bdlim - bdoff;
	pkt_payload_len = (uint16_t)MIN(pkt_payload_len, kpkt->pkt_length);
	pkt_payload_len -= udp_payload_offset;

	if (pkt_payload_len >= MIN_TRACE_DUMP_PKT_SIZE) {
		uh = (struct udphdr *)kpkt->pkt_flow_udp_hdr;
		uint8_t *payload = (uint8_t *)(uh + 1);

		/* Trace 32 bytes of UDP transport payload */
		uint64_t *trace1 = __DECONST(uint64_t *, payload);
		uint64_t *trace2 = trace1 + 1;
		uint64_t *trace3 = trace2 + 1;
		uint64_t *trace4 = trace3 + 1;

		if (input) {
			KDBG(IFNET_KTRACE_RX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
		} else {
			KDBG(IFNET_KTRACE_TX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
		}
	}
}
#endif /* DEVELOPMENT || DEBUG */
5385
5386 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5387 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5388 {
5389 struct ifnet *ifp = handle;
5390 boolean_t pdrop; /* dummy */
5391 uint32_t i;
5392
5393 ASSERT(n_pkts >= 1);
5394 for (i = 0; i < n_pkts - 1; i++) {
5395 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5396 FALSE, &pdrop);
5397 }
5398 /* flush with the last packet */
5399 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5400 TRUE, &pdrop);
5401
5402 return 0;
5403 }
5404
5405 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5406 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5407 classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5408 {
5409 #if DEVELOPMENT || DEBUG
5410 switch (pkt->cp_ptype) {
5411 case QP_PACKET: {
5412 trace_pkt_dump_payload(ifp, pkt->cp_kpkt, false);
5413 break;
5414 }
5415 case QP_MBUF:
5416 case QP_INVALID: {
5417 break;
5418 }
5419 }
5420 #endif /* DEVELOPMENT || DEBUG */
5421
5422 if (ifp->if_output_netem != NULL) {
5423 bool drop;
5424 errno_t error;
5425 error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5426 *pdrop = drop ? TRUE : FALSE;
5427 return error;
5428 } else {
5429 return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5430 }
5431 }
5432
5433 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5434 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5435 {
5436 boolean_t pdrop;
5437 return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5438 }
5439
5440 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5441 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5442 boolean_t *pdrop)
5443 {
5444 classq_pkt_t pkt;
5445
5446 if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5447 m->m_nextpkt != NULL) {
5448 if (m != NULL) {
5449 m_freem_list(m);
5450 *pdrop = TRUE;
5451 }
5452 return EINVAL;
5453 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5454 !IF_FULLY_ATTACHED(ifp)) {
5455 /* flag tested without lock for performance */
5456 m_freem(m);
5457 *pdrop = TRUE;
5458 return ENXIO;
5459 } else if (!(ifp->if_flags & IFF_UP)) {
5460 m_freem(m);
5461 *pdrop = TRUE;
5462 return ENETDOWN;
5463 }
5464
5465 CLASSQ_PKT_INIT_MBUF(&pkt, m);
5466 return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5467 }
5468
5469 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5470 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5471 struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5472 boolean_t *pdrop)
5473 {
5474 classq_pkt_t head, tail;
5475
5476 ASSERT(m_head != NULL);
5477 ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5478 ASSERT(m_tail != NULL);
5479 ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5480 ASSERT(ifp != NULL);
5481 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5482
5483 if (!IF_FULLY_ATTACHED(ifp)) {
5484 /* flag tested without lock for performance */
5485 m_freem_list(m_head);
5486 *pdrop = TRUE;
5487 return ENXIO;
5488 } else if (!(ifp->if_flags & IFF_UP)) {
5489 m_freem_list(m_head);
5490 *pdrop = TRUE;
5491 return ENETDOWN;
5492 }
5493
5494 CLASSQ_PKT_INIT_MBUF(&head, m_head);
5495 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5496 return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5497 flush, pdrop);
5498 }
5499
5500 #if SKYWALK
5501 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5502 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5503 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5504 {
5505 classq_pkt_t pkt;
5506
5507 ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5508
5509 if (__improbable(ifp == NULL || kpkt == NULL)) {
5510 if (kpkt != NULL) {
5511 pp_free_packet(__DECONST(struct kern_pbufpool *,
5512 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5513 *pdrop = TRUE;
5514 }
5515 return EINVAL;
5516 } else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5517 !IF_FULLY_ATTACHED(ifp))) {
5518 /* flag tested without lock for performance */
5519 pp_free_packet(__DECONST(struct kern_pbufpool *,
5520 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5521 *pdrop = TRUE;
5522 return ENXIO;
5523 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5524 pp_free_packet(__DECONST(struct kern_pbufpool *,
5525 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5526 *pdrop = TRUE;
5527 return ENETDOWN;
5528 }
5529
5530 CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5531 return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5532 }
5533
5534 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5535 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5536 boolean_t flush, boolean_t *pdrop)
5537 {
5538 return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5539 }
5540
5541 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5542 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5543 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5544 {
5545 return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5546 }
5547
5548 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5549 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5550 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5551 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5552 {
5553 classq_pkt_t head, tail;
5554
5555 ASSERT(k_head != NULL);
5556 ASSERT(k_tail != NULL);
5557 ASSERT(ifp != NULL);
5558 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5559
5560 if (!IF_FULLY_ATTACHED(ifp)) {
5561 /* flag tested without lock for performance */
5562 pp_free_packet_chain(k_head, NULL);
5563 *pdrop = TRUE;
5564 return ENXIO;
5565 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5566 pp_free_packet_chain(k_head, NULL);
5567 *pdrop = TRUE;
5568 return ENETDOWN;
5569 }
5570
5571 CLASSQ_PKT_INIT_PACKET(&head, k_head);
5572 CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5573 return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5574 flush, pdrop);
5575 }
5576
5577 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5578 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5579 struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5580 boolean_t *pdrop)
5581 {
5582 return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5583 cnt, bytes, flush, pdrop);
5584 }
5585
5586 errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5587 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5588 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5589 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5590 {
5591 return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5592 cnt, bytes, flush, pdrop);
5593 }
5594 #endif /* SKYWALK */
5595
5596 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5597 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5598 {
5599 errno_t rc;
5600 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5601
5602 if (ifp == NULL || mp == NULL) {
5603 return EINVAL;
5604 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5605 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5606 return ENXIO;
5607 }
5608 if (!ifnet_is_attached(ifp, 1)) {
5609 return ENXIO;
5610 }
5611
5612 #if SKYWALK
5613 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5614 #endif /* SKYWALK */
5615 rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5616 &pkt, NULL, NULL, NULL, 0);
5617 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5618 ifnet_decr_iorefcnt(ifp);
5619 *mp = pkt.cp_mbuf;
5620 return rc;
5621 }
5622
5623 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5624 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5625 struct mbuf **mp)
5626 {
5627 errno_t rc;
5628 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5629
5630 if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5631 return EINVAL;
5632 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5633 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5634 return ENXIO;
5635 }
5636 if (!ifnet_is_attached(ifp, 1)) {
5637 return ENXIO;
5638 }
5639
5640 #if SKYWALK
5641 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5642 #endif /* SKYWALK */
5643 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5644 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5645 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5646 ifnet_decr_iorefcnt(ifp);
5647 *mp = pkt.cp_mbuf;
5648 return rc;
5649 }
5650
5651 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5652 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5653 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5654 {
5655 errno_t rc;
5656 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5657 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5658
5659 if (ifp == NULL || head == NULL || pkt_limit < 1) {
5660 return EINVAL;
5661 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5662 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5663 return ENXIO;
5664 }
5665 if (!ifnet_is_attached(ifp, 1)) {
5666 return ENXIO;
5667 }
5668
5669 #if SKYWALK
5670 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5671 #endif /* SKYWALK */
5672 rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5673 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5674 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5675 ifnet_decr_iorefcnt(ifp);
5676 *head = pkt_head.cp_mbuf;
5677 if (tail != NULL) {
5678 *tail = pkt_tail.cp_mbuf;
5679 }
5680 return rc;
5681 }
5682
/*
 * Dequeue packets from the interface's send queue up to byte_limit
 * bytes (no packet-count limit).  Output parameters are as in
 * ifnet_dequeue_multi().  Returns EINVAL on bad arguments, ENXIO if
 * the interface does not use the TXSTART model or is not attached,
 * else the classq dequeue result.
 */
errno_t
ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
    struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || byte_limit < 1) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* takes an io refcnt on success; released below */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
	    byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
	/* only mbuf-backed packets are expected on this path */
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5714
/*
 * Dequeue up to pkt_limit packets belonging to the given mbuf service
 * class from the interface's send queue.  Output parameters are as in
 * ifnet_dequeue_multi().  Returns EINVAL on bad arguments or invalid
 * service class, ENXIO if the interface does not use the TXSTART
 * model or is not attached, else the classq dequeue result.
 */
errno_t
ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
    u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
    u_int32_t *len)
{
	errno_t rc;
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);

	if (ifp == NULL || head == NULL || pkt_limit < 1 ||
	    !MBUF_VALID_SC(sc)) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
		return ENXIO;
	}
	/* takes an io refcnt on success; released below */
	if (!ifnet_is_attached(ifp, 1)) {
		return ENXIO;
	}

#if SKYWALK
	ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
	rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
	    CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
	    cnt, len, 0);
	/* only mbuf-backed packets are expected on this path */
	VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
	ifnet_decr_iorefcnt(ifp);
	*head = pkt_head.cp_mbuf;
	if (tail != NULL) {
		*tail = pkt_tail.cp_mbuf;
	}
	return rc;
}
5749
5750 #if XNU_TARGET_OS_OSX
/*
 * Adapter for drivers registered with the legacy framer KPI, which
 * cannot report prepended/appended header lengths: zero the optional
 * pre/post outputs, then defer to the legacy framer callback.
 */
errno_t
ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *dest, const char *dest_linkaddr,
    const char *frame_type, u_int32_t *pre, u_int32_t *post)
{
	if (pre != NULL) {
		*pre = 0;
	}
	if (post != NULL) {
		*post = 0;
	}

	return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
}
5765 #endif /* XNU_TARGET_OS_OSX */
5766
5767 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5768 packet_has_vlan_tag(struct mbuf * m)
5769 {
5770 u_int tag = 0;
5771
5772 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5773 tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5774 if (tag == 0) {
5775 /* the packet is just priority-tagged, clear the bit */
5776 m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5777 }
5778 }
5779 return tag != 0;
5780 }
5781
/*
 * Run an inbound packet through the interface filters attached to ifp.
 * A filter may modify (or consume) *m_p and *frame_header_p.  Returns
 * 0 when the packet should continue up the stack, or the filter's
 * non-zero result (e.g. EJUSTRETURN) when a filter took over or
 * dropped the packet.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet = FALSE;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	/* fast path: no filters attached */
	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the mutex across the callback; the
			 * monitor-busy state above keeps the list stable.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5842
/*
 * Run an outbound packet through the interface filters attached to
 * ifp.  A filter may modify (or consume) *m_p.  Returns 0 when the
 * packet should continue toward the driver, or the filter's non-zero
 * result (e.g. EJUSTRETURN) when a filter took over or dropped it.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the mutex across the callback; the
			 * monitor-busy state above keeps the list stable.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5892
/*
 * Deliver an mbuf chain to the protocol attachment ifproto.  v1
 * protocol KPIs take one packet at a time (the chain is unlinked and
 * each packet's saved frame header passed alongside), while v2 KPIs
 * accept the whole packet list.  On any error other than EJUSTRETURN
 * (which means the callee kept the packet) the packet(s) are freed.
 */
static void
dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
{
	int error;

	if (ifproto->proto_kpi == kProtoKPI_v1) {
		/* Version 1 protocols get one packet at a time */
		while (m != NULL) {
			char * frame_header;
			mbuf_t next_packet;

			next_packet = m->m_nextpkt;
			m->m_nextpkt = NULL;
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
			    ifproto->protocol_family, m, frame_header);
			if (error != 0 && error != EJUSTRETURN) {
				m_freem(m);
			}
			m = next_packet;
		}
	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
		/* Version 2 protocols support packet lists */
		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
		    ifproto->protocol_family, m);
		if (error != 0 && error != EJUSTRETURN) {
			m_freem_list(m);
		}
	}
}
5924
/*
 * Accumulate per-batch driver statistics into the input thread's
 * local counters (inp->dlth_stats); these are later folded into the
 * ifnet's global counters by dlil_input_stats_sync().  The non-zero
 * checks skip counters with nothing to add.  When `poll' is set, the
 * input packets/bytes are also credited to the interface's polling
 * tally.
 */
static void
dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
    struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
{
	struct ifnet_stat_increment_param *d = &inp->dlth_stats;

	if (s->packets_in != 0) {
		d->packets_in += s->packets_in;
	}
	if (s->bytes_in != 0) {
		d->bytes_in += s->bytes_in;
	}
	if (s->errors_in != 0) {
		d->errors_in += s->errors_in;
	}

	if (s->packets_out != 0) {
		d->packets_out += s->packets_out;
	}
	if (s->bytes_out != 0) {
		d->bytes_out += s->bytes_out;
	}
	if (s->errors_out != 0) {
		d->errors_out += s->errors_out;
	}

	if (s->collisions != 0) {
		d->collisions += s->collisions;
	}
	if (s->dropped != 0) {
		d->dropped += s->dropped;
	}

	if (poll) {
		PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
	}
}
5962
/*
 * Fold the input thread's locally-accumulated statistics into the
 * ifnet's global counters, zeroing the local copies, and likewise
 * fold the per-thread polling tally into the interface poll stats.
 * Returns TRUE when ifp->if_data_threshold is non-zero.
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
6022
6023 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6024 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6025 {
6026 return dlil_input_packet_list_common(ifp, m, 0,
6027 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6028 }
6029
6030 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6031 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6032 u_int32_t cnt, ifnet_model_t mode)
6033 {
6034 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6035 }
6036
/*
 * Core receive path shared by dlil_input_packet_list() and its
 * extended variant.  Walks the chain of received packets; for each
 * one: demuxes it to a protocol family, optionally performs CLAT46
 * translation, adjusts partial-checksum offsets, runs the interface
 * filters, and batches consecutive packets destined to the same
 * protocol attachment before handing them up via dlil_ifproto_input().
 * When `ext' is set with the polling model on, ifnet_poll() is kicked
 * every if_rxpoll_interval_pkts packets.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;
	mbuf_t *pkt_next = NULL;
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		/*
		 * When no interface was supplied by the caller, packets in
		 * the chain may belong to different interfaces.
		 */
		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				goto next;
			}
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				/* implausible frame header: invalidate the
				 * hardware-computed checksum */
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			atomic_add_64(&ifp->if_imcasts, 1);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6369
6370 errno_t
if_mcasts_update(struct ifnet * ifp)6371 if_mcasts_update(struct ifnet *ifp)
6372 {
6373 errno_t err;
6374
6375 err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6376 if (err == EAFNOSUPPORT) {
6377 err = 0;
6378 }
6379 DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6380 "(err=%d)\n", if_name(ifp),
6381 (err == 0 ? "successfully restored" : "failed to restore"),
6382 ifp->if_updatemcasts, err);
6383
6384 /* just return success */
6385 return 0;
6386 }
6387
/*
 * Post a kernel event system-wide.  If ifp is set, we will increment
 * the generation for the interface first; NECP clients (when built
 * in) are told to re-evaluate.  Returns the kev_post_msg() result.
 */
int
dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
{
	if (ifp != NULL) {
		ifnet_increment_generation(ifp);
	}

#if NECP
	necp_update_all_clients();
#endif /* NECP */

	return kev_post_msg(event);
}
6402
6403 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6404 dlil_post_sifflags_msg(struct ifnet * ifp)
6405 {
6406 struct kev_msg ev_msg;
6407 struct net_event_data ev_data;
6408
6409 bzero(&ev_data, sizeof(ev_data));
6410 bzero(&ev_msg, sizeof(ev_msg));
6411 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6412 ev_msg.kev_class = KEV_NETWORK_CLASS;
6413 ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6414 ev_msg.event_code = KEV_DL_SIFFLAGS;
6415 strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6416 ev_data.if_family = ifp->if_family;
6417 ev_data.if_unit = (u_int32_t) ifp->if_unit;
6418 ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6419 ev_msg.dv[0].data_ptr = &ev_data;
6420 ev_msg.dv[1].data_length = 0;
6421 dlil_post_complete_msg(ifp, &ev_msg);
6422 }
6423
6424 #define TMP_IF_PROTO_ARR_SIZE 10
/*
 * Deliver a kernel event to every consumer attached to the interface:
 * first the interface filters, then each attached protocol's event
 * callback, then the interface's own if_event handler; finally the
 * event is posted system-wide via dlil_post_complete_msg().  The
 * interface generation is bumped only when update_generation is true.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the mutex across the callback */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			/* small attachment count: use the stack array */
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* snapshot all attachments, each with a proto refcnt held */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* invoke each protocol's event callback, outside the ifnet lock */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6524
6525 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6526 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6527 {
6528 struct kev_msg kev_msg;
6529 int result = 0;
6530
6531 if (ifp == NULL || event == NULL) {
6532 return EINVAL;
6533 }
6534
6535 bzero(&kev_msg, sizeof(kev_msg));
6536 kev_msg.vendor_code = event->vendor_code;
6537 kev_msg.kev_class = event->kev_class;
6538 kev_msg.kev_subclass = event->kev_subclass;
6539 kev_msg.event_code = event->event_code;
6540 kev_msg.dv[0].data_ptr = &event->event_data[0];
6541 kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6542 kev_msg.dv[1].data_length = 0;
6543
6544 result = dlil_event_internal(ifp, &kev_msg, TRUE);
6545
6546 return result;
6547 }
6548
6549 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6550 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6551 {
6552 mbuf_t n = m;
6553 int chainlen = 0;
6554
6555 while (n != NULL) {
6556 chainlen++;
6557 n = n->m_next;
6558 }
6559 switch (chainlen) {
6560 case 0:
6561 break;
6562 case 1:
6563 atomic_add_64(&cls->cls_one, 1);
6564 break;
6565 case 2:
6566 atomic_add_64(&cls->cls_two, 1);
6567 break;
6568 case 3:
6569 atomic_add_64(&cls->cls_three, 1);
6570 break;
6571 case 4:
6572 atomic_add_64(&cls->cls_four, 1);
6573 break;
6574 case 5:
6575 default:
6576 atomic_add_64(&cls->cls_five_or_more, 1);
6577 break;
6578 }
6579 }
6580
6581 #if CONFIG_DTRACE
/*
 * Fire the DTrace "send" IP probe for an outbound packet: pass the v4
 * header (NULL v6) for PF_INET, or the v6 header (NULL v4) for
 * PF_INET6; other families fire no probe.  Assumes the IP header sits
 * at the start of the mbuf data (mtod).
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
6598 #endif /* CONFIG_DTRACE */
6599
6600 /*
6601 * dlil_output
6602 *
6603 * Caller should have a lock on the protocol domain if the protocol
6604 * doesn't support finer grained locking. In most cases, the lock
6605 * will be held from the socket layer and won't be released until
6606 * we return back to the socket layer.
6607 *
6608 * This does mean that we must take a protocol lock before we take
6609 * an interface lock if we're going to take both. This makes sense
6610 * because a protocol is likely to interact with an ifp while it
6611 * is under the protocol lock.
6612 *
6613 * An advisory code will be returned if adv is not null. This
6614 * can be used to provide feedback about interface queues to the
6615 * application.
6616 */
6617 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int raw,struct flowadv * adv)6618 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6619 void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
6620 {
6621 char *frame_type = NULL;
6622 char *dst_linkaddr = NULL;
6623 int retval = 0;
6624 char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
6625 char dst_linkaddr_buffer[MAX_LINKADDR * 4];
6626 struct if_proto *proto = NULL;
6627 mbuf_t m = NULL;
6628 mbuf_t send_head = NULL;
6629 mbuf_t *send_tail = &send_head;
6630 int iorefcnt = 0;
6631 u_int32_t pre = 0, post = 0;
6632 u_int32_t fpkts = 0, fbytes = 0;
6633 int32_t flen = 0;
6634 struct timespec now;
6635 u_int64_t now_nsec;
6636 boolean_t did_clat46 = FALSE;
6637 protocol_family_t old_proto_family = proto_family;
6638 struct sockaddr_in6 dest6;
6639 struct rtentry *rt = NULL;
6640 u_int32_t m_loop_set = 0;
6641
6642 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6643
6644 /*
6645 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6646 * from happening while this operation is in progress
6647 */
6648 if (!ifnet_datamov_begin(ifp)) {
6649 retval = ENXIO;
6650 goto cleanup;
6651 }
6652 iorefcnt = 1;
6653
6654 VERIFY(ifp->if_output_dlil != NULL);
6655
6656 /* update the driver's multicast filter, if needed */
6657 if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6658 ifp->if_updatemcasts = 0;
6659 }
6660
6661 frame_type = frame_type_buffer;
6662 dst_linkaddr = dst_linkaddr_buffer;
6663
6664 if (raw == 0) {
6665 ifnet_lock_shared(ifp);
6666 /* callee holds a proto refcnt upon success */
6667 proto = find_attached_proto(ifp, proto_family);
6668 if (proto == NULL) {
6669 ifnet_lock_done(ifp);
6670 retval = ENXIO;
6671 goto cleanup;
6672 }
6673 ifnet_lock_done(ifp);
6674 }
6675
6676 preout_again:
6677 if (packetlist == NULL) {
6678 goto cleanup;
6679 }
6680
6681 m = packetlist;
6682 packetlist = packetlist->m_nextpkt;
6683 m->m_nextpkt = NULL;
6684
6685 m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6686
6687 /*
6688 * Perform address family translation for the first
6689 * packet outside the loop in order to perform address
6690 * lookup for the translated proto family.
6691 */
6692 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6693 (ifp->if_type == IFT_CELLULAR ||
6694 dlil_is_clat_needed(proto_family, m))) {
6695 retval = dlil_clat46(ifp, &proto_family, &m);
6696 /*
6697 * Go to the next packet if translation fails
6698 */
6699 if (retval != 0) {
6700 m_freem(m);
6701 m = NULL;
6702 ip6stat.ip6s_clat464_out_drop++;
6703 /* Make sure that the proto family is PF_INET */
6704 ASSERT(proto_family == PF_INET);
6705 goto preout_again;
6706 }
6707 /*
6708 * Free the old one and make it point to the IPv6 proto structure.
6709 *
6710 * Change proto for the first time we have successfully
6711 * performed address family translation.
6712 */
6713 if (!did_clat46 && proto_family == PF_INET6) {
6714 did_clat46 = TRUE;
6715
6716 if (proto != NULL) {
6717 if_proto_free(proto);
6718 }
6719 ifnet_lock_shared(ifp);
6720 /* callee holds a proto refcnt upon success */
6721 proto = find_attached_proto(ifp, proto_family);
6722 if (proto == NULL) {
6723 ifnet_lock_done(ifp);
6724 retval = ENXIO;
6725 m_freem(m);
6726 m = NULL;
6727 goto cleanup;
6728 }
6729 ifnet_lock_done(ifp);
6730 if (ifp->if_type == IFT_ETHER) {
6731 /* Update the dest to translated v6 address */
6732 dest6.sin6_len = sizeof(struct sockaddr_in6);
6733 dest6.sin6_family = AF_INET6;
6734 dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
6735 dest = (const struct sockaddr *)&dest6;
6736
6737 /*
6738 * Lookup route to the translated destination
6739 * Free this route ref during cleanup
6740 */
6741 rt = rtalloc1_scoped((struct sockaddr *)&dest6,
6742 0, 0, ifp->if_index);
6743
6744 route = rt;
6745 }
6746 }
6747 }
6748
6749 /*
6750 * This path gets packet chain going to the same destination.
6751 * The pre output routine is used to either trigger resolution of
6752 * the next hop or retreive the next hop's link layer addressing.
6753 * For ex: ether_inet(6)_pre_output routine.
6754 *
6755 * If the routine returns EJUSTRETURN, it implies that packet has
6756 * been queued, and therefore we have to call preout_again for the
6757 * following packet in the chain.
6758 *
6759 * For errors other than EJUSTRETURN, the current packet is freed
6760 * and the rest of the chain (pointed by packetlist is freed as
6761 * part of clean up.
6762 *
6763 * Else if there is no error the retrieved information is used for
6764 * all the packets in the chain.
6765 */
6766 if (raw == 0) {
6767 proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
6768 proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
6769 retval = 0;
6770 if (preoutp != NULL) {
6771 retval = preoutp(ifp, proto_family, &m, dest, route,
6772 frame_type, dst_linkaddr);
6773
6774 if (retval != 0) {
6775 if (retval == EJUSTRETURN) {
6776 goto preout_again;
6777 }
6778 m_freem(m);
6779 m = NULL;
6780 goto cleanup;
6781 }
6782 }
6783 }
6784
6785 do {
6786 /*
6787 * pkt_hdr is set here to point to m_data prior to
6788 * calling into the framer. This value of pkt_hdr is
6789 * used by the netif gso logic to retrieve the ip header
6790 * for the TCP packets, offloaded for TSO processing.
6791 */
6792 if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
6793 uint8_t vlan_encap_len = 0;
6794
6795 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
6796 vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
6797 }
6798 m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
6799 } else {
6800 m->m_pkthdr.pkt_hdr = mtod(m, void *);
6801 }
6802
6803 /*
6804 * Perform address family translation if needed.
6805 * For now we only support stateless 4 to 6 translation
6806 * on the out path.
6807 *
6808 * The routine below translates IP header, updates protocol
6809 * checksum and also translates ICMP.
6810 *
6811 * We skip the first packet as it is already translated and
6812 * the proto family is set to PF_INET6.
6813 */
6814 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6815 (ifp->if_type == IFT_CELLULAR ||
6816 dlil_is_clat_needed(proto_family, m))) {
6817 retval = dlil_clat46(ifp, &proto_family, &m);
6818 /* Goto the next packet if the translation fails */
6819 if (retval != 0) {
6820 m_freem(m);
6821 m = NULL;
6822 ip6stat.ip6s_clat464_out_drop++;
6823 goto next;
6824 }
6825 }
6826
6827 #if CONFIG_DTRACE
6828 if (!raw) {
6829 dlil_output_dtrace(ifp, proto_family, m);
6830 }
6831 #endif /* CONFIG_DTRACE */
6832
6833 if (raw == 0 && ifp->if_framer != NULL) {
6834 int rcvif_set = 0;
6835
6836 /*
6837 * If this is a broadcast packet that needs to be
6838 * looped back into the system, set the inbound ifp
6839 * to that of the outbound ifp. This will allow
6840 * us to determine that it is a legitimate packet
6841 * for the system. Only set the ifp if it's not
6842 * already set, just to be safe.
6843 */
6844 if ((m->m_flags & (M_BCAST | M_LOOP)) &&
6845 m->m_pkthdr.rcvif == NULL) {
6846 m->m_pkthdr.rcvif = ifp;
6847 rcvif_set = 1;
6848 }
6849 m_loop_set = m->m_flags & M_LOOP;
6850 retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
6851 frame_type, &pre, &post);
6852 if (retval != 0) {
6853 if (retval != EJUSTRETURN) {
6854 m_freem(m);
6855 }
6856 goto next;
6857 }
6858
6859 /*
6860 * For partial checksum offload, adjust the start
6861 * and stuff offsets based on the prepended header.
6862 */
6863 if ((m->m_pkthdr.csum_flags &
6864 (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6865 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6866 m->m_pkthdr.csum_tx_stuff += pre;
6867 m->m_pkthdr.csum_tx_start += pre;
6868 }
6869
6870 if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
6871 dlil_output_cksum_dbg(ifp, m, pre,
6872 proto_family);
6873 }
6874
6875 /*
6876 * Clear the ifp if it was set above, and to be
6877 * safe, only if it is still the same as the
6878 * outbound ifp we have in context. If it was
6879 * looped back, then a copy of it was sent to the
6880 * loopback interface with the rcvif set, and we
6881 * are clearing the one that will go down to the
6882 * layer below.
6883 */
6884 if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
6885 m->m_pkthdr.rcvif = NULL;
6886 }
6887 }
6888
6889 /*
6890 * Let interface filters (if any) do their thing ...
6891 */
6892 retval = dlil_interface_filters_output(ifp, &m, proto_family);
6893 if (retval != 0) {
6894 if (retval != EJUSTRETURN) {
6895 m_freem(m);
6896 }
6897 goto next;
6898 }
6899 /*
6900 * Strip away M_PROTO1 bit prior to sending packet
6901 * to the driver as this field may be used by the driver
6902 */
6903 m->m_flags &= ~M_PROTO1;
6904
6905 /*
6906 * If the underlying interface is not capable of handling a
6907 * packet whose data portion spans across physically disjoint
6908 * pages, we need to "normalize" the packet so that we pass
6909 * down a chain of mbufs where each mbuf points to a span that
6910 * resides in the system page boundary. If the packet does
6911 * not cross page(s), the following is a no-op.
6912 */
6913 if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
6914 if ((m = m_normalize(m)) == NULL) {
6915 goto next;
6916 }
6917 }
6918
6919 /*
6920 * If this is a TSO packet, make sure the interface still
6921 * advertise TSO capability.
6922 */
6923 if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
6924 retval = EMSGSIZE;
6925 m_freem(m);
6926 goto cleanup;
6927 }
6928
6929 ifp_inc_traffic_class_out(ifp, m);
6930
6931 #if SKYWALK
6932 /*
6933 * For native skywalk devices, packets will be passed to pktap
6934 * after GSO or after the mbuf to packet conversion.
6935 * This is done for IPv4/IPv6 packets only because there is no
6936 * space in the mbuf to pass down the proto family.
6937 */
6938 if (dlil_is_native_netif_nexus(ifp)) {
6939 if (raw || m->m_pkthdr.pkt_proto == 0) {
6940 pktap_output(ifp, proto_family, m, pre, post);
6941 m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
6942 }
6943 } else {
6944 pktap_output(ifp, proto_family, m, pre, post);
6945 }
6946 #else /* SKYWALK */
6947 pktap_output(ifp, proto_family, m, pre, post);
6948 #endif /* SKYWALK */
6949
6950 /*
6951 * Count the number of elements in the mbuf chain
6952 */
6953 if (tx_chain_len_count) {
6954 dlil_count_chain_len(m, &tx_chain_len_stats);
6955 }
6956
6957 /*
6958 * Record timestamp; ifnet_enqueue() will use this info
6959 * rather than redoing the work. An optimization could
6960 * involve doing this just once at the top, if there are
6961 * no interface filters attached, but that's probably
6962 * not a big deal.
6963 */
6964 nanouptime(&now);
6965 net_timernsec(&now, &now_nsec);
6966 (void) mbuf_set_timestamp(m, now_nsec, TRUE);
6967
6968 /*
6969 * Discard partial sum information if this packet originated
6970 * from another interface; the packet would already have the
6971 * final checksum and we shouldn't recompute it.
6972 */
6973 if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
6974 (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6975 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6976 m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
6977 m->m_pkthdr.csum_data = 0;
6978 }
6979
6980 /*
6981 * Finally, call the driver.
6982 */
6983 if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
6984 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6985 flen += (m_pktlen(m) - (pre + post));
6986 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6987 }
6988 *send_tail = m;
6989 send_tail = &m->m_nextpkt;
6990 } else {
6991 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6992 flen = (m_pktlen(m) - (pre + post));
6993 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6994 } else {
6995 flen = 0;
6996 }
6997 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
6998 0, 0, 0, 0, 0);
6999 retval = (*ifp->if_output_dlil)(ifp, m);
7000 if (retval == EQFULL || retval == EQSUSPENDED) {
7001 if (adv != NULL && adv->code == FADV_SUCCESS) {
7002 adv->code = (retval == EQFULL ?
7003 FADV_FLOW_CONTROLLED :
7004 FADV_SUSPENDED);
7005 }
7006 retval = 0;
7007 }
7008 if (retval == 0 && flen > 0) {
7009 fbytes += flen;
7010 fpkts++;
7011 }
7012 if (retval != 0 && dlil_verbose) {
7013 DLIL_PRINTF("%s: output error on %s retval = %d\n",
7014 __func__, if_name(ifp),
7015 retval);
7016 }
7017 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
7018 0, 0, 0, 0, 0);
7019 }
7020 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7021
7022 next:
7023 m = packetlist;
7024 if (m != NULL) {
7025 m->m_flags |= m_loop_set;
7026 packetlist = packetlist->m_nextpkt;
7027 m->m_nextpkt = NULL;
7028 }
7029 /* Reset the proto family to old proto family for CLAT */
7030 if (did_clat46) {
7031 proto_family = old_proto_family;
7032 }
7033 } while (m != NULL);
7034
7035 if (send_head != NULL) {
7036 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7037 0, 0, 0, 0, 0);
7038 if (ifp->if_eflags & IFEF_SENDLIST) {
7039 retval = (*ifp->if_output_dlil)(ifp, send_head);
7040 if (retval == EQFULL || retval == EQSUSPENDED) {
7041 if (adv != NULL) {
7042 adv->code = (retval == EQFULL ?
7043 FADV_FLOW_CONTROLLED :
7044 FADV_SUSPENDED);
7045 }
7046 retval = 0;
7047 }
7048 if (retval == 0 && flen > 0) {
7049 fbytes += flen;
7050 fpkts++;
7051 }
7052 if (retval != 0 && dlil_verbose) {
7053 DLIL_PRINTF("%s: output error on %s retval = %d\n",
7054 __func__, if_name(ifp), retval);
7055 }
7056 } else {
7057 struct mbuf *send_m;
7058 int enq_cnt = 0;
7059 VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
7060 while (send_head != NULL) {
7061 send_m = send_head;
7062 send_head = send_m->m_nextpkt;
7063 send_m->m_nextpkt = NULL;
7064 retval = (*ifp->if_output_dlil)(ifp, send_m);
7065 if (retval == EQFULL || retval == EQSUSPENDED) {
7066 if (adv != NULL) {
7067 adv->code = (retval == EQFULL ?
7068 FADV_FLOW_CONTROLLED :
7069 FADV_SUSPENDED);
7070 }
7071 retval = 0;
7072 }
7073 if (retval == 0) {
7074 enq_cnt++;
7075 if (flen > 0) {
7076 fpkts++;
7077 }
7078 }
7079 if (retval != 0 && dlil_verbose) {
7080 DLIL_PRINTF("%s: output error on %s "
7081 "retval = %d\n",
7082 __func__, if_name(ifp), retval);
7083 }
7084 }
7085 if (enq_cnt > 0) {
7086 fbytes += flen;
7087 ifnet_start(ifp);
7088 }
7089 }
7090 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7091 }
7092
7093 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7094
7095 cleanup:
7096 if (fbytes > 0) {
7097 ifp->if_fbytes += fbytes;
7098 }
7099 if (fpkts > 0) {
7100 ifp->if_fpackets += fpkts;
7101 }
7102 if (proto != NULL) {
7103 if_proto_free(proto);
7104 }
7105 if (packetlist) { /* if any packets are left, clean up */
7106 mbuf_freem_list(packetlist);
7107 }
7108 if (retval == EJUSTRETURN) {
7109 retval = 0;
7110 }
7111 if (iorefcnt == 1) {
7112 ifnet_datamov_end(ifp);
7113 }
7114 if (rt != NULL) {
7115 rtfree(rt);
7116 rt = NULL;
7117 }
7118
7119 return retval;
7120 }
7121
7122 /*
7123 * This routine checks if the destination address is not a loopback, link-local,
7124 * multicast or broadcast address.
7125 */
7126 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7127 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7128 {
7129 int ret = 0;
7130 switch (proto_family) {
7131 case PF_INET: {
7132 struct ip *iph = mtod(m, struct ip *);
7133 if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7134 ret = 1;
7135 }
7136 break;
7137 }
7138 case PF_INET6: {
7139 struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7140 if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7141 CLAT64_NEEDED(&ip6h->ip6_dst)) {
7142 ret = 1;
7143 }
7144 break;
7145 }
7146 }
7147
7148 return ret;
7149 }
7150 /*
7151 * @brief This routine translates IPv4 packet to IPv6 packet,
7152 * updates protocol checksum and also translates ICMP for code
7153 * along with inner header translation.
7154 *
7155 * @param ifp Pointer to the interface
7156 * @param proto_family pointer to protocol family. It is updated if function
7157 * performs the translation successfully.
7158 * @param m Pointer to the pointer pointing to the packet. Needed because this
7159 * routine can end up changing the mbuf to a different one.
7160 *
7161 * @return 0 on success or else a negative value.
7162 */
7163 static errno_t
dlil_clat46(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7164 dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7165 {
7166 VERIFY(*proto_family == PF_INET);
7167 VERIFY(IS_INTF_CLAT46(ifp));
7168
7169 pbuf_t pbuf_store, *pbuf = NULL;
7170 struct ip *iph = NULL;
7171 struct in_addr osrc, odst;
7172 uint8_t proto = 0;
7173 struct in6_ifaddr *ia6_clat_src = NULL;
7174 struct in6_addr *src = NULL;
7175 struct in6_addr dst;
7176 int error = 0;
7177 uint16_t off = 0;
7178 uint16_t tot_len = 0;
7179 uint16_t ip_id_val = 0;
7180 uint16_t ip_frag_off = 0;
7181
7182 boolean_t is_frag = FALSE;
7183 boolean_t is_first_frag = TRUE;
7184 boolean_t is_last_frag = TRUE;
7185
7186 pbuf_init_mbuf(&pbuf_store, *m, ifp);
7187 pbuf = &pbuf_store;
7188 iph = pbuf->pb_data;
7189
7190 osrc = iph->ip_src;
7191 odst = iph->ip_dst;
7192 proto = iph->ip_p;
7193 off = (uint16_t)(iph->ip_hl << 2);
7194 ip_id_val = iph->ip_id;
7195 ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;
7196
7197 tot_len = ntohs(iph->ip_len);
7198
7199 /*
7200 * For packets that are not first frags
7201 * we only need to adjust CSUM.
7202 * For 4 to 6, Fragmentation header gets appended
7203 * after proto translation.
7204 */
7205 if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
7206 is_frag = TRUE;
7207
7208 /* If the offset is not zero, it is not first frag */
7209 if (ip_frag_off != 0) {
7210 is_first_frag = FALSE;
7211 }
7212
7213 /* If IP_MF is set, then it is not last frag */
7214 if (ntohs(iph->ip_off) & IP_MF) {
7215 is_last_frag = FALSE;
7216 }
7217 }
7218
7219 /*
7220 * Retrive the local IPv6 CLAT46 address reserved for stateless
7221 * translation.
7222 */
7223 ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7224 if (ia6_clat_src == NULL) {
7225 ip6stat.ip6s_clat464_out_nov6addr_drop++;
7226 error = -1;
7227 goto cleanup;
7228 }
7229
7230 src = &ia6_clat_src->ia_addr.sin6_addr;
7231
7232 /*
7233 * Translate IPv4 destination to IPv6 destination by using the
7234 * prefixes learned through prior PLAT discovery.
7235 */
7236 if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
7237 ip6stat.ip6s_clat464_out_v6synthfail_drop++;
7238 goto cleanup;
7239 }
7240
7241 /* Translate the IP header part first */
7242 error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
7243 iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;
7244
7245 iph = NULL; /* Invalidate iph as pbuf has been modified */
7246
7247 if (error != 0) {
7248 ip6stat.ip6s_clat464_out_46transfail_drop++;
7249 goto cleanup;
7250 }
7251
7252 /*
7253 * Translate protocol header, update checksum, checksum flags
7254 * and related fields.
7255 */
7256 error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
7257 proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;
7258
7259 if (error != 0) {
7260 ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
7261 goto cleanup;
7262 }
7263
7264 /* Now insert the IPv6 fragment header */
7265 if (is_frag) {
7266 error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);
7267
7268 if (error != 0) {
7269 ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
7270 goto cleanup;
7271 }
7272 }
7273
7274 cleanup:
7275 if (ia6_clat_src != NULL) {
7276 IFA_REMREF(&ia6_clat_src->ia_ifa);
7277 }
7278
7279 if (pbuf_is_valid(pbuf)) {
7280 *m = pbuf->pb_mbuf;
7281 pbuf->pb_mbuf = NULL;
7282 pbuf_destroy(pbuf);
7283 } else {
7284 error = -1;
7285 ip6stat.ip6s_clat464_out_invalpbuf_drop++;
7286 }
7287
7288 if (error == 0) {
7289 *proto_family = PF_INET6;
7290 ip6stat.ip6s_clat464_out_success++;
7291 }
7292
7293 return error;
7294 }
7295
7296 /*
7297 * @brief This routine translates incoming IPv6 to IPv4 packet,
7298 * updates protocol checksum and also translates ICMPv6 outer
7299 * and inner headers
7300 *
7301 * @return 0 on success or else a negative value.
7302 */
7303 static errno_t
dlil_clat64(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7304 dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7305 {
7306 VERIFY(*proto_family == PF_INET6);
7307 VERIFY(IS_INTF_CLAT46(ifp));
7308
7309 struct ip6_hdr *ip6h = NULL;
7310 struct in6_addr osrc, odst;
7311 uint8_t proto = 0;
7312 struct in6_ifaddr *ia6_clat_dst = NULL;
7313 struct in_ifaddr *ia4_clat_dst = NULL;
7314 struct in_addr *dst = NULL;
7315 struct in_addr src;
7316 int error = 0;
7317 uint32_t off = 0;
7318 u_int64_t tot_len = 0;
7319 uint8_t tos = 0;
7320 boolean_t is_first_frag = TRUE;
7321
7322 /* Incoming mbuf does not contain valid IP6 header */
7323 if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
7324 ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
7325 (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
7326 ip6stat.ip6s_clat464_in_tooshort_drop++;
7327 return -1;
7328 }
7329
7330 ip6h = mtod(*m, struct ip6_hdr *);
7331 /* Validate that mbuf contains IP payload equal to ip6_plen */
7332 if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
7333 ip6stat.ip6s_clat464_in_tooshort_drop++;
7334 return -1;
7335 }
7336
7337 osrc = ip6h->ip6_src;
7338 odst = ip6h->ip6_dst;
7339
7340 /*
7341 * Retrieve the local CLAT46 reserved IPv6 address.
7342 * Let the packet pass if we don't find one, as the flag
7343 * may get set before IPv6 configuration has taken place.
7344 */
7345 ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7346 if (ia6_clat_dst == NULL) {
7347 goto done;
7348 }
7349
7350 /*
7351 * Check if the original dest in the packet is same as the reserved
7352 * CLAT46 IPv6 address
7353 */
7354 if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
7355 pbuf_t pbuf_store, *pbuf = NULL;
7356 pbuf_init_mbuf(&pbuf_store, *m, ifp);
7357 pbuf = &pbuf_store;
7358
7359 /*
7360 * Retrive the local CLAT46 IPv4 address reserved for stateless
7361 * translation.
7362 */
7363 ia4_clat_dst = inifa_ifpclatv4(ifp);
7364 if (ia4_clat_dst == NULL) {
7365 IFA_REMREF(&ia6_clat_dst->ia_ifa);
7366 ip6stat.ip6s_clat464_in_nov4addr_drop++;
7367 error = -1;
7368 goto cleanup;
7369 }
7370 IFA_REMREF(&ia6_clat_dst->ia_ifa);
7371
7372 /* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
7373 dst = &ia4_clat_dst->ia_addr.sin_addr;
7374 if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
7375 ip6stat.ip6s_clat464_in_v4synthfail_drop++;
7376 error = -1;
7377 goto cleanup;
7378 }
7379
7380 ip6h = pbuf->pb_data;
7381 off = sizeof(struct ip6_hdr);
7382 proto = ip6h->ip6_nxt;
7383 tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
7384 tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);
7385
7386 /*
7387 * Translate the IP header and update the fragmentation
7388 * header if needed
7389 */
7390 error = (nat464_translate_64(pbuf, off, tos, &proto,
7391 ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
7392 0 : -1;
7393
7394 ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */
7395
7396 if (error != 0) {
7397 ip6stat.ip6s_clat464_in_64transfail_drop++;
7398 goto cleanup;
7399 }
7400
7401 /*
7402 * Translate protocol header, update checksum, checksum flags
7403 * and related fields.
7404 */
7405 error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
7406 (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
7407 NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;
7408
7409 if (error != 0) {
7410 ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
7411 goto cleanup;
7412 }
7413
7414 cleanup:
7415 if (ia4_clat_dst != NULL) {
7416 IFA_REMREF(&ia4_clat_dst->ia_ifa);
7417 }
7418
7419 if (pbuf_is_valid(pbuf)) {
7420 *m = pbuf->pb_mbuf;
7421 pbuf->pb_mbuf = NULL;
7422 pbuf_destroy(pbuf);
7423 } else {
7424 error = -1;
7425 ip6stat.ip6s_clat464_in_invalpbuf_drop++;
7426 }
7427
7428 if (error == 0) {
7429 *proto_family = PF_INET;
7430 ip6stat.ip6s_clat464_in_success++;
7431 }
7432 } /* CLAT traffic */
7433
7434 done:
7435 return error;
7436 }
7437
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/* Arguments captured for a deferred ifnet ioctl request */
struct ifnet_ioctl_event {
	struct ifnet *ifp;	/* interface; an io refcnt is held for it */
	u_long ioctl_code;	/* ioctl to issue from the workqueue thread */
};

/* Workqueue entry embedding the ioctl arguments (recovered via __container_of) */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7450
7451 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7452 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7453 {
7454 struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7455
7456 /*
7457 * Get an io ref count if the interface is attached.
7458 * At this point it most likely is. We are taking a reference for
7459 * deferred processing.
7460 */
7461 if (!ifnet_is_attached(ifp, 1)) {
7462 os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7463 "is not attached",
7464 __func__, __LINE__, if_name(ifp), ioctl_code);
7465 return;
7466 }
7467
7468 p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7469 Z_WAITOK | Z_ZERO | Z_NOFAIL);
7470
7471 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7472 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7473 p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7474 nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7475 }
7476
7477 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7478 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7479 {
7480 struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7481 struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7482
7483 struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7484 u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7485 int ret = 0;
7486
7487 if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7488 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7489 __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7490 } else if (dlil_verbose) {
7491 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7492 "for ioctl %lu",
7493 __func__, __LINE__, if_name(ifp), ioctl_code);
7494 }
7495 ifnet_decr_iorefcnt(ifp);
7496 kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7497 return;
7498 }
7499
/*
 * Dispatch an ioctl to an interface: interface filters run first, then the
 * attached protocol (when proto_fam != 0), then the driver's if_ioctl.
 * The first handler returning something other than 0/EOPNOTSUPP wins;
 * EJUSTRETURN from a handler stops dispatch and is reported as success.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;	/* best answer seen so far */
	int result = 0;			/* answer from the current handler */

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock while calling out to the filter */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled, stop processing"; report success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7617
7618 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7619 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7620 {
7621 errno_t error = 0;
7622
7623
7624 if (ifp->if_set_bpf_tap) {
7625 /* Get an io reference on the interface if it is attached */
7626 if (!ifnet_is_attached(ifp, 1)) {
7627 return ENXIO;
7628 }
7629 error = ifp->if_set_bpf_tap(ifp, mode, callback);
7630 ifnet_decr_iorefcnt(ifp);
7631 }
7632 return error;
7633 }
7634
7635 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7636 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7637 struct sockaddr *ll_addr, size_t ll_len)
7638 {
7639 errno_t result = EOPNOTSUPP;
7640 struct if_proto *proto;
7641 const struct sockaddr *verify;
7642 proto_media_resolve_multi resolvep;
7643
7644 if (!ifnet_is_attached(ifp, 1)) {
7645 return result;
7646 }
7647
7648 bzero(ll_addr, ll_len);
7649
7650 /* Call the protocol first; callee holds a proto refcnt upon success */
7651 ifnet_lock_shared(ifp);
7652 proto = find_attached_proto(ifp, proto_addr->sa_family);
7653 ifnet_lock_done(ifp);
7654 if (proto != NULL) {
7655 resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7656 proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7657 if (resolvep != NULL) {
7658 result = resolvep(ifp, proto_addr,
7659 (struct sockaddr_dl *)(void *)ll_addr, ll_len);
7660 }
7661 if_proto_free(proto);
7662 }
7663
7664 /* Let the interface verify the multicast address */
7665 if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7666 if (result == 0) {
7667 verify = ll_addr;
7668 } else {
7669 verify = proto_addr;
7670 }
7671 result = ifp->if_check_multi(ifp, verify);
7672 }
7673
7674 ifnet_decr_iorefcnt(ifp);
7675 return result;
7676 }
7677
7678 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7679 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7680 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7681 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7682 {
7683 struct if_proto *proto;
7684 errno_t result = 0;
7685
7686 if ((ifp->if_flags & IFF_NOARP) != 0) {
7687 result = ENOTSUP;
7688 goto done;
7689 }
7690
7691 /* callee holds a proto refcnt upon success */
7692 ifnet_lock_shared(ifp);
7693 proto = find_attached_proto(ifp, target_proto->sa_family);
7694 ifnet_lock_done(ifp);
7695 if (proto == NULL) {
7696 result = ENOTSUP;
7697 } else {
7698 proto_media_send_arp arpp;
7699 arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7700 proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7701 if (arpp == NULL) {
7702 result = ENOTSUP;
7703 } else {
7704 switch (arpop) {
7705 case ARPOP_REQUEST:
7706 arpstat.txrequests++;
7707 if (target_hw != NULL) {
7708 arpstat.txurequests++;
7709 }
7710 break;
7711 case ARPOP_REPLY:
7712 arpstat.txreplies++;
7713 break;
7714 }
7715 result = arpp(ifp, arpop, sender_hw, sender_proto,
7716 target_hw, target_proto);
7717 }
7718 if_proto_free(proto);
7719 }
7720 done:
7721 return result;
7722 }
7723
/*
 * Opaque cookie type for the thread-mark push/pop API below. A pushed
 * mark set is encoded as a byte offset from &net_thread_marks_base, so
 * the cookie itself carries the bits that must be undone on pop.
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* The "no bits changed" cookie: offset zero from the base */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7729
7730 __private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)7731 net_thread_marks_push(u_int32_t push)
7732 {
7733 static const char *const base = (const void*)&net_thread_marks_base;
7734 u_int32_t pop = 0;
7735
7736 if (push != 0) {
7737 struct uthread *uth = current_uthread();
7738
7739 pop = push & ~uth->uu_network_marks;
7740 if (pop != 0) {
7741 uth->uu_network_marks |= pop;
7742 }
7743 }
7744
7745 return (net_thread_marks_t)&base[pop];
7746 }
7747
7748 __private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)7749 net_thread_unmarks_push(u_int32_t unpush)
7750 {
7751 static const char *const base = (const void*)&net_thread_marks_base;
7752 u_int32_t unpop = 0;
7753
7754 if (unpush != 0) {
7755 struct uthread *uth = current_uthread();
7756
7757 unpop = unpush & uth->uu_network_marks;
7758 if (unpop != 0) {
7759 uth->uu_network_marks &= ~unpop;
7760 }
7761 }
7762
7763 return (net_thread_marks_t)&base[unpop];
7764 }
7765
7766 __private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)7767 net_thread_marks_pop(net_thread_marks_t popx)
7768 {
7769 static const char *const base = (const void*)&net_thread_marks_base;
7770 const ptrdiff_t pop = (const char *)popx - (const char *)base;
7771
7772 if (pop != 0) {
7773 static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
7774 struct uthread *uth = current_uthread();
7775
7776 VERIFY((pop & ones) == pop);
7777 VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
7778 uth->uu_network_marks &= ~pop;
7779 }
7780 }
7781
/*
 * Undo a previous net_thread_unmarks_push() by restoring the mark bits
 * it cleared; the cookie's offset from the base anchor identifies them.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* offset must fit in 32 bits, i.e. be a valid mark mask */
		VERIFY((unpop & ones) == unpop);
		/* bits being restored must currently be clear */
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
7797
7798 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7799 net_thread_is_marked(u_int32_t check)
7800 {
7801 if (check != 0) {
7802 struct uthread *uth = current_uthread();
7803 return uth->uu_network_marks & check;
7804 } else {
7805 return 0;
7806 }
7807 }
7808
7809 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7810 net_thread_is_unmarked(u_int32_t check)
7811 {
7812 if (check != 0) {
7813 struct uthread *uth = current_uthread();
7814 return ~uth->uu_network_marks & check;
7815 } else {
7816 return 0;
7817 }
7818 }
7819
/*
 * An ARP announcement (gratuitous ARP) carries identical sender and
 * target protocol addresses; report whether that is the case here.
 * Returns 0 when either address is missing.
 */
static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,
    const struct sockaddr_in * target_sin)
{
	if (sender_sin == NULL || target_sin == NULL) {
		return 0;
	}
	return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
}
7830
/*
 * Send an ARP packet.  Normally the request is handed straight to
 * dlil_send_arp_internal() on the given interface; the special case is
 * an ARP *request* for an IPv4 link-local target (when ipv4_ll_arp_aware
 * is enabled) that is not an announcement, which is replicated across
 * every interface marked IFEF_ARPLL, each using that interface's own
 * link-layer and IPv4 source addresses.
 *
 * Returns 0 on success, EINVAL on bad/mismatched addresses, or the
 * (last interesting) error from dlil_send_arp_internal(); ENOTSUP if
 * no ARPLL-capable interface could be used on the multi-send path.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		/* copy into a sockaddr_inarp so SIN_ROUTER can be flagged */
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces. The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t *ifp_list;
		u_int32_t count;
		u_int32_t ifp_on;

		/* remains ENOTSUP unless at least one send is attempted */
		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing. This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* hold the lladdr across the unlocked send */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* keep the first real status we obtain */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7945
7946 /*
7947 * Caller must hold ifnet head lock.
7948 */
7949 static int
ifnet_lookup(struct ifnet * ifp)7950 ifnet_lookup(struct ifnet *ifp)
7951 {
7952 struct ifnet *_ifp;
7953
7954 LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7955 TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7956 if (_ifp == ifp) {
7957 break;
7958 }
7959 }
7960 return _ifp != NULL;
7961 }
7962
7963 /*
7964 * Caller has to pass a non-zero refio argument to get a
7965 * IO reference count. This will prevent ifnet_detach from
7966 * being called when there are outstanding io reference counts.
7967 */
/*
 * Caller has to pass a non-zero refio argument to get a
 * IO reference count. This will prevent ifnet_detach from
 * being called when there are outstanding io reference counts.
 *
 * Returns nonzero iff the interface is fully attached; when it is and
 * refio > 0, an IO reference is taken that the caller must drop with
 * ifnet_decr_iorefcnt().
 */
int
ifnet_is_attached(struct ifnet *ifp, int refio)
{
	int ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED(ifp))) {
		if (refio > 0) {
			ifp->if_refio++;
		}
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
7983
/* Account for one more kernel thread being started for this interface. */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7991
/*
 * Account for one interface thread having started up; wake anyone
 * sleeping on if_threads_pending once the count reaches zero.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8003
8004 /*
8005 * Caller must ensure the interface is attached; the assumption is that
8006 * there is at least an outstanding IO reference count held already.
8007 * Most callers would call ifnet_is_{attached,data_ready}() instead.
8008 */
/*
 * Caller must ensure the interface is attached; the assumption is that
 * there is at least an outstanding IO reference count held already.
 * Most callers would call ifnet_is_{attached,data_ready}() instead.
 */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/* both VERIFYs enforce the documented precondition above */
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8018
/*
 * Drop one IO reference with if_ref_lock already held; shared by
 * ifnet_decr_iorefcnt(), ifnet_datamov_end() and ifnet_datamov_resume().
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* datamov references are a subset of IO references */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8039
/* Drop one IO reference taken via ifnet_is_attached()/ifnet_incr_iorefcnt(). */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8047
/*
 * Enter the data-movement path: take both an IO reference and a datamov
 * reference iff the interface is fully attached and not suspended.
 * Returns FALSE (and takes nothing) otherwise; on TRUE the caller must
 * balance with ifnet_datamov_end().
 */
boolean_t
ifnet_datamov_begin(struct ifnet *ifp)
{
	boolean_t ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
		ifp->if_refio++;
		ifp->if_datamov++;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
8062
/*
 * Leave the data-movement path: drop the datamov reference taken by
 * ifnet_datamov_begin(), waking any drainers blocked in
 * ifnet_datamov_drain(), then drop the paired IO reference.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8080
/*
 * Suspend data movement with if_ref_lock held: take an IO reference
 * (released by ifnet_datamov_resume()) and, on the first suspension,
 * clear IFRF_READY so ifnet_datamov_begin() starts failing.
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8091
/*
 * Suspend data movement on the interface; must be balanced with
 * ifnet_datamov_resume().  Suspensions nest via if_suspend.
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8100
/*
 * Like ifnet_datamov_suspend(), but a no-op when the interface is
 * already suspended.  Returns TRUE iff this call took the suspension
 * (in which case the caller owes an ifnet_datamov_resume()).
 */
boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	if (ifp->if_suspend > 0) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		return FALSE;
	}
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
	return TRUE;
}
8114
/*
 * Block until every in-flight data-movement reference is gone, then
 * purge the interface transmit queues.  The interface must already be
 * suspended (see VERIFYs), so no new datamov references can appear
 * while we sleep.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	/* full mutex lock (not spin): we may sleep below */
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		/* woken by ifnet_datamov_end() when if_datamov hits zero */
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8142
/*
 * Convenience wrapper: suspend data movement, then wait for all
 * in-flight movers to drain.  Balance with ifnet_datamov_resume().
 */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8149
/*
 * Undo one ifnet_datamov_suspend(): when the last suspension is lifted,
 * re-mark the interface IFRF_READY, then drop the IO reference the
 * suspend took.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8163
/*
 * Record a backtrace for a dlil_ifnet refcount event into the debug
 * ring buffers; refhold selects the hold vs. release history.  Only
 * valid for interfaces created with DLIF_DEBUG set.
 */
static void
dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
{
	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
	ctrace_t *tr;
	u_int32_t idx;
	u_int16_t *cnt;

	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
		/* NOTREACHED */
	}

	if (refhold) {
		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
		tr = dl_if_dbg->dldbg_if_refhold;
	} else {
		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
		tr = dl_if_dbg->dldbg_if_refrele;
	}

	/* atomically claim the next ring-buffer slot, wrapping at the end */
	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
	ctrace_record(&tr[idx]);
}
8188
/*
 * Take a reference on the underlying dlil_ifnet of ifp.  Returns EINVAL
 * for a NULL ifp; panics on refcount wraparound.  Balance with
 * dlil_if_free().
 */
errno_t
dlil_if_ref(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	++dl_if->dl_if_refcnt;
	if (dl_if->dl_if_refcnt == 0) {
		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
	}
	/* optional per-interface trace hook (set when DLIF_DEBUG) */
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, TRUE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);

	return 0;
}
8211
/*
 * Drop a reference taken by dlil_if_ref().  When the last reference is
 * dropped on an interface that is still embryonic (never attached),
 * release it back via _dlil_if_release().  Returns EINVAL for NULL ifp;
 * panics on refcount underflow.
 */
errno_t
dlil_if_free(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	bool need_release = FALSE;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	switch (dl_if->dl_if_refcnt) {
	case 0:
		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
		break;
	case 1:
		/* dropping the last ref on a never-attached interface */
		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
			need_release = TRUE;
		}
		break;
	default:
		break;
	}
	--dl_if->dl_if_refcnt;
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, FALSE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);
	/* release outside dl_if_lock */
	if (need_release) {
		_dlil_if_release(ifp, true);
	}
	return 0;
}
8246
/*
 * Common back end for ifnet_attach_protocol{,_v2}(): bind a filled-in
 * if_proto to its interface.  Rejects non-PF_BRIDGE protocols on vmnet
 * interfaces and duplicate attachments (EEXIST); lets the family module
 * refine the demux descriptors; inserts the proto at the tail of its
 * hash chain; and posts a KEV_DL_PROTO_ATTACHED event.  On success the
 * attach itself holds one proto reference.  If proto_count is non-NULL
 * it receives the number of protocols attached after this one.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* take an IO ref so the ifp can't detach under us */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		/* drop the lookup ref before bailing with EEXIST */
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* walk to the tail of the chain so insertion preserves order */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	/* release the IO ref taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8326
/*
 * Post-attach housekeeping shared by the v1 and v2 attach paths: mark
 * the interface up and broadcast the flag change; on Skywalk builds
 * also attach the flowswitch nexus when IP was the protocol attached.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8350
/*
 * Attach a v1 protocol to an interface: validate the arguments, verify
 * the interface is on the global list, populate an if_proto with the
 * caller's v1 KPI callbacks, and hand off to dlil_attach_protocol().
 * On success the interface is also marked up (dlil_handle_proto_attach);
 * on failure the if_proto allocation is freed.  Returns 0, EINVAL,
 * ENXIO, EEXIST, or the family module's error.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* bring the interface up now that a protocol is attached */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8412
/*
 * v2 counterpart of ifnet_attach_protocol(): identical flow, but fills
 * in the kpi.v2 callback set (v2 input takes no separate frame header).
 * Returns 0, EINVAL, ENXIO, EEXIST, or the family module's error.
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* bring the interface up now that a protocol is attached */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8474
/*
 * Detach a protocol from an interface: notify the family module,
 * unlink the if_proto from its hash chain, and replace its KPI
 * callbacks with the ifproto_media_* stubs so any caller still holding
 * the proto sees benign ENXIO failures instead of running freed hooks.
 * Both the attach-time and lookup references are released; final
 * teardown happens when the last proto reference goes away.
 * Returns 0, EINVAL for bad arguments, or ENXIO if not attached.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* neuter the callbacks for any stragglers still holding the proto */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8540
8541
/* Stub installed on a detached v1 proto's input hook; rejects with ENXIO. */
static errno_t
ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet, char *header)
{
#pragma unused(ifp, protocol, packet, header)
	return ENXIO;
}
8549
/* Stub installed on a detached v2 proto's input hook; rejects with ENXIO. */
static errno_t
ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet)
{
#pragma unused(ifp, protocol, packet)
	return ENXIO;
}
8557
/* Stub installed on a detached proto's pre_output hook; rejects with ENXIO. */
static errno_t
ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
    char *link_layer_dest)
{
#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
	return ENXIO;
}
8566
/* Stub installed on a detached proto's event hook; silently discards. */
static void
ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
    const struct kev_msg *event)
{
#pragma unused(ifp, protocol, event)
}
8573
/* Stub installed on a detached proto's ioctl hook; rejects with ENXIO. */
static errno_t
ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
    unsigned long command, void *argument)
{
#pragma unused(ifp, protocol, command, argument)
	return ENXIO;
}
8581
/* Stub installed on a detached proto's resolve_multi hook; rejects with ENXIO. */
static errno_t
ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
    struct sockaddr_dl *out_ll, size_t ll_len)
{
#pragma unused(ifp, proto_addr, out_ll, ll_len)
	return ENXIO;
}
8589
/* Stub installed on a detached proto's send_arp hook; rejects with ENXIO. */
static errno_t
ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
	return ENXIO;
}
8598
8599 extern int if_next_index(void);
8600 extern int tcp_ecn_outbound;
8601
/*
 * Configure and initialize a transmit classq for an interface: derive
 * the scheduler flags from global policy (flow advisory, delay-based
 * queueing) and the interface's output scheduling model, inherit the
 * drop limit from the default send queue for secondary queues, then
 * set the queue up.  Panics if ifclassq_setup() fails.
 */
void
dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
{
	uint32_t sflags = 0;
	int err;

	if (if_flowadv) {
		sflags |= PKTSCHEDF_QALG_FLOWCTL;
	}

	if (if_delaybased_queue) {
		sflags |= PKTSCHEDF_QALG_DELAYBASED;
	}

	if (ifp->if_output_sched_model ==
	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
	}
	/* Inherit drop limit from the default queue */
	if (ifp->if_snd != ifcq) {
		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
	}
	/* Initialize transmit queue(s) */
	err = ifclassq_setup(ifcq, ifp, sflags);
	if (err != 0) {
		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
		    "err=%d", __func__, ifp, err);
		/* NOTREACHED */
	}
}
8632
8633 errno_t
ifnet_attach(ifnet_t ifp,const struct sockaddr_dl * ll_addr)8634 ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
8635 {
8636 #if SKYWALK
8637 boolean_t netif_compat;
8638 if_nexus_netif nexus_netif;
8639 #endif /* SKYWALK */
8640 struct ifnet *tmp_if;
8641 struct ifaddr *ifa;
8642 struct if_data_internal if_data_saved;
8643 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8644 struct dlil_threading_info *dl_inp;
8645 thread_continue_t thfunc = NULL;
8646 int err;
8647
8648 if (ifp == NULL) {
8649 return EINVAL;
8650 }
8651
8652 /*
8653 * Serialize ifnet attach using dlil_ifnet_lock, in order to
8654 * prevent the interface from being configured while it is
8655 * embryonic, as ifnet_head_lock is dropped and reacquired
8656 * below prior to marking the ifnet with IFRF_ATTACHED.
8657 */
8658 dlil_if_lock();
8659 ifnet_head_lock_exclusive();
8660 /* Verify we aren't already on the list */
8661 TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
8662 if (tmp_if == ifp) {
8663 ifnet_head_done();
8664 dlil_if_unlock();
8665 return EEXIST;
8666 }
8667 }
8668
8669 lck_mtx_lock_spin(&ifp->if_ref_lock);
8670 if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
8671 panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
8672 __func__, ifp);
8673 /* NOTREACHED */
8674 }
8675 lck_mtx_unlock(&ifp->if_ref_lock);
8676
8677 ifnet_lock_exclusive(ifp);
8678
8679 /* Sanity check */
8680 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
8681 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
8682 VERIFY(ifp->if_threads_pending == 0);
8683
8684 if (ll_addr != NULL) {
8685 if (ifp->if_addrlen == 0) {
8686 ifp->if_addrlen = ll_addr->sdl_alen;
8687 } else if (ll_addr->sdl_alen != ifp->if_addrlen) {
8688 ifnet_lock_done(ifp);
8689 ifnet_head_done();
8690 dlil_if_unlock();
8691 return EINVAL;
8692 }
8693 }
8694
8695 /*
8696 * Allow interfaces without protocol families to attach
8697 * only if they have the necessary fields filled out.
8698 */
8699 if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
8700 DLIL_PRINTF("%s: Attempt to attach interface without "
8701 "family module - %d\n", __func__, ifp->if_family);
8702 ifnet_lock_done(ifp);
8703 ifnet_head_done();
8704 dlil_if_unlock();
8705 return ENODEV;
8706 }
8707
8708 /* Allocate protocol hash table */
8709 VERIFY(ifp->if_proto_hash == NULL);
8710 ifp->if_proto_hash = zalloc_flags(dlif_phash_zone,
8711 Z_WAITOK | Z_ZERO | Z_NOFAIL);
8712
8713 lck_mtx_lock_spin(&ifp->if_flt_lock);
8714 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
8715 TAILQ_INIT(&ifp->if_flt_head);
8716 VERIFY(ifp->if_flt_busy == 0);
8717 VERIFY(ifp->if_flt_waiters == 0);
8718 VERIFY(ifp->if_flt_non_os_count == 0);
8719 VERIFY(ifp->if_flt_no_tso_count == 0);
8720 lck_mtx_unlock(&ifp->if_flt_lock);
8721
8722 if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
8723 VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
8724 LIST_INIT(&ifp->if_multiaddrs);
8725 }
8726
8727 VERIFY(ifp->if_allhostsinm == NULL);
8728 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
8729 TAILQ_INIT(&ifp->if_addrhead);
8730
8731 if (ifp->if_index == 0) {
8732 int idx = if_next_index();
8733
8734 if (idx == -1) {
8735 ifp->if_index = 0;
8736 ifnet_lock_done(ifp);
8737 ifnet_head_done();
8738 dlil_if_unlock();
8739 return ENOBUFS;
8740 }
8741 ifp->if_index = (uint16_t)idx;
8742
8743 /* the lladdr passed at attach time is the permanent address */
8744 if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
8745 ll_addr->sdl_alen == ETHER_ADDR_LEN) {
8746 bcopy(CONST_LLADDR(ll_addr),
8747 dl_if->dl_if_permanent_ether,
8748 ETHER_ADDR_LEN);
8749 dl_if->dl_if_permanent_ether_is_set = 1;
8750 }
8751 }
8752 /* There should not be anything occupying this slot */
8753 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
8754
8755 /* allocate (if needed) and initialize a link address */
8756 ifa = dlil_alloc_lladdr(ifp, ll_addr);
8757 if (ifa == NULL) {
8758 ifnet_lock_done(ifp);
8759 ifnet_head_done();
8760 dlil_if_unlock();
8761 return ENOBUFS;
8762 }
8763
8764 VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
8765 ifnet_addrs[ifp->if_index - 1] = ifa;
8766
8767 /* make this address the first on the list */
8768 IFA_LOCK(ifa);
8769 /* hold a reference for ifnet_addrs[] */
8770 IFA_ADDREF_LOCKED(ifa);
8771 /* if_attach_link_ifa() holds a reference for ifa_link */
8772 if_attach_link_ifa(ifp, ifa);
8773 IFA_UNLOCK(ifa);
8774
8775 TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
8776 ifindex2ifnet[ifp->if_index] = ifp;
8777
8778 /* Hold a reference to the underlying dlil_ifnet */
8779 ifnet_reference(ifp);
8780
8781 /* Clear stats (save and restore other fields that we care) */
8782 if_data_saved = ifp->if_data;
8783 bzero(&ifp->if_data, sizeof(ifp->if_data));
8784 ifp->if_data.ifi_type = if_data_saved.ifi_type;
8785 ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
8786 ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
8787 ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
8788 ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
8789 ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
8790 ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
8791 ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
8792 ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
8793 ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
8794 ifnet_touch_lastchange(ifp);
8795
8796 VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
8797 ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
8798 ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);
8799
8800 dlil_ifclassq_setup(ifp, ifp->if_snd);
8801
8802 /* Sanity checks on the input thread storage */
8803 dl_inp = &dl_if->dl_if_inpstorage;
8804 bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
8805 VERIFY(dl_inp->dlth_flags == 0);
8806 VERIFY(dl_inp->dlth_wtot == 0);
8807 VERIFY(dl_inp->dlth_ifp == NULL);
8808 VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
8809 VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
8810 VERIFY(!dl_inp->dlth_affinity);
8811 VERIFY(ifp->if_inp == NULL);
8812 VERIFY(dl_inp->dlth_thread == THREAD_NULL);
8813 VERIFY(dl_inp->dlth_strategy == NULL);
8814 VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
8815 VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
8816 VERIFY(dl_inp->dlth_affinity_tag == 0);
8817
8818 #if IFNET_INPUT_SANITY_CHK
8819 VERIFY(dl_inp->dlth_pkts_cnt == 0);
8820 #endif /* IFNET_INPUT_SANITY_CHK */
8821
8822 VERIFY(ifp->if_poll_thread == THREAD_NULL);
8823 dlil_reset_rxpoll_params(ifp);
8824 /*
8825 * A specific DLIL input thread is created per non-loopback interface.
8826 */
8827 if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
8828 ifp->if_inp = dl_inp;
8829 ifnet_incr_pending_thread_count(ifp);
8830 err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
8831 if (err == ENODEV) {
8832 VERIFY(thfunc == NULL);
8833 ifnet_decr_pending_thread_count(ifp);
8834 } else if (err != 0) {
8835 panic_plain("%s: ifp=%p couldn't get an input thread; "
8836 "err=%d", __func__, ifp, err);
8837 /* NOTREACHED */
8838 }
8839 }
8840 /*
8841 * If the driver supports the new transmit model, calculate flow hash
8842 * and create a workloop starter thread to invoke the if_start callback
8843 * where the packets may be dequeued and transmitted.
8844 */
8845 if (ifp->if_eflags & IFEF_TXSTART) {
8846 thread_precedence_policy_data_t info;
8847 __unused kern_return_t kret;
8848
8849 ifp->if_flowhash = ifnet_calc_flowhash(ifp);
8850 VERIFY(ifp->if_flowhash != 0);
8851 VERIFY(ifp->if_start_thread == THREAD_NULL);
8852
8853 ifnet_set_start_cycle(ifp, NULL);
8854 ifp->if_start_active = 0;
8855 ifp->if_start_req = 0;
8856 ifp->if_start_flags = 0;
8857 VERIFY(ifp->if_start != NULL);
8858 ifnet_incr_pending_thread_count(ifp);
8859 if ((err = kernel_thread_start(ifnet_start_thread_func,
8860 ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
8861 panic_plain("%s: "
8862 "ifp=%p couldn't get a start thread; "
8863 "err=%d", __func__, ifp, err);
8864 /* NOTREACHED */
8865 }
8866 bzero(&info, sizeof(info));
8867 info.importance = 1;
8868 kret = thread_policy_set(ifp->if_start_thread,
8869 THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8870 THREAD_PRECEDENCE_POLICY_COUNT);
8871 ASSERT(kret == KERN_SUCCESS);
8872 } else {
8873 ifp->if_flowhash = 0;
8874 }
8875
8876 /* Reset polling parameters */
8877 ifnet_set_poll_cycle(ifp, NULL);
8878 ifp->if_poll_update = 0;
8879 ifp->if_poll_flags = 0;
8880 ifp->if_poll_req = 0;
8881 VERIFY(ifp->if_poll_thread == THREAD_NULL);
8882
8883 /*
8884 * If the driver supports the new receive model, create a poller
8885 * thread to invoke if_input_poll callback where the packets may
8886 * be dequeued from the driver and processed for reception.
8887 * if the interface is netif compat then the poller thread is
8888 * managed by netif.
8889 */
8890 if (thfunc == dlil_rxpoll_input_thread_func) {
8891 thread_precedence_policy_data_t info;
8892 __unused kern_return_t kret;
8893 #if SKYWALK
8894 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
8895 #endif /* SKYWALK */
8896 VERIFY(ifp->if_input_poll != NULL);
8897 VERIFY(ifp->if_input_ctl != NULL);
8898 ifnet_incr_pending_thread_count(ifp);
8899 if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
8900 &ifp->if_poll_thread)) != KERN_SUCCESS) {
8901 panic_plain("%s: ifp=%p couldn't get a poll thread; "
8902 "err=%d", __func__, ifp, err);
8903 /* NOTREACHED */
8904 }
8905 bzero(&info, sizeof(info));
8906 info.importance = 1;
8907 kret = thread_policy_set(ifp->if_poll_thread,
8908 THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8909 THREAD_PRECEDENCE_POLICY_COUNT);
8910 ASSERT(kret == KERN_SUCCESS);
8911 }
8912
8913 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
8914 VERIFY(ifp->if_desc.ifd_len == 0);
8915 VERIFY(ifp->if_desc.ifd_desc != NULL);
8916
8917 /* Record attach PC stacktrace */
8918 ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
8919
8920 ifp->if_updatemcasts = 0;
8921 if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
8922 struct ifmultiaddr *ifma;
8923 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
8924 IFMA_LOCK(ifma);
8925 if (ifma->ifma_addr->sa_family == AF_LINK ||
8926 ifma->ifma_addr->sa_family == AF_UNSPEC) {
8927 ifp->if_updatemcasts++;
8928 }
8929 IFMA_UNLOCK(ifma);
8930 }
8931
8932 DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
8933 "membership(s)\n", if_name(ifp),
8934 ifp->if_updatemcasts);
8935 }
8936
8937 /* Clear logging parameters */
8938 bzero(&ifp->if_log, sizeof(ifp->if_log));
8939
8940 /* Clear foreground/realtime activity timestamps */
8941 ifp->if_fg_sendts = 0;
8942 ifp->if_rt_sendts = 0;
8943
8944 /* Clear throughput estimates and radio type */
8945 ifp->if_estimated_up_bucket = 0;
8946 ifp->if_estimated_down_bucket = 0;
8947 ifp->if_radio_type = 0;
8948 ifp->if_radio_channel = 0;
8949
8950 VERIFY(ifp->if_delegated.ifp == NULL);
8951 VERIFY(ifp->if_delegated.type == 0);
8952 VERIFY(ifp->if_delegated.family == 0);
8953 VERIFY(ifp->if_delegated.subfamily == 0);
8954 VERIFY(ifp->if_delegated.expensive == 0);
8955 VERIFY(ifp->if_delegated.constrained == 0);
8956
8957 VERIFY(ifp->if_agentids == NULL);
8958 VERIFY(ifp->if_agentcount == 0);
8959
8960 /* Reset interface state */
8961 bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
8962 ifp->if_interface_state.valid_bitmask |=
8963 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
8964 ifp->if_interface_state.interface_availability =
8965 IF_INTERFACE_STATE_INTERFACE_AVAILABLE;
8966
8967 /* Initialize Link Quality Metric (loopback [lo0] is always good) */
8968 if (ifp == lo_ifp) {
8969 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
8970 ifp->if_interface_state.valid_bitmask |=
8971 IF_INTERFACE_STATE_LQM_STATE_VALID;
8972 } else {
8973 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
8974 }
8975
8976 /*
8977 * Enable ECN capability on this interface depending on the
8978 * value of ECN global setting
8979 */
8980 if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
8981 if_set_eflags(ifp, IFEF_ECN_ENABLE);
8982 if_clear_eflags(ifp, IFEF_ECN_DISABLE);
8983 }
8984
8985 /*
8986 * Built-in Cyclops always on policy for WiFi infra
8987 */
8988 if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
8989 errno_t error;
8990
8991 error = if_set_qosmarking_mode(ifp,
8992 IFRTYPE_QOSMARKING_FASTLANE);
8993 if (error != 0) {
8994 DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
8995 __func__, ifp->if_xname, error);
8996 } else {
8997 if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
8998 #if (DEVELOPMENT || DEBUG)
8999 DLIL_PRINTF("%s fastlane enabled on %s\n",
9000 __func__, ifp->if_xname);
9001 #endif /* (DEVELOPMENT || DEBUG) */
9002 }
9003 }
9004
9005 ifnet_lock_done(ifp);
9006 ifnet_head_done();
9007
9008 #if SKYWALK
9009 netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
9010 #endif /* SKYWALK */
9011
9012 lck_mtx_lock(&ifp->if_cached_route_lock);
9013 /* Enable forwarding cached route */
9014 ifp->if_fwd_cacheok = 1;
9015 /* Clean up any existing cached routes */
9016 ROUTE_RELEASE(&ifp->if_fwd_route);
9017 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9018 ROUTE_RELEASE(&ifp->if_src_route);
9019 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9020 ROUTE_RELEASE(&ifp->if_src_route6);
9021 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9022 lck_mtx_unlock(&ifp->if_cached_route_lock);
9023
9024 ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
9025
9026 /*
9027 * Allocate and attach IGMPv3/MLDv2 interface specific variables
9028 * and trees; do this before the ifnet is marked as attached.
9029 * The ifnet keeps the reference to the info structures even after
9030 * the ifnet is detached, since the network-layer records still
9031 * refer to the info structures even after that. This also
9032 * makes it possible for them to still function after the ifnet
9033 * is recycled or reattached.
9034 */
9035 #if INET
9036 if (IGMP_IFINFO(ifp) == NULL) {
9037 IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
9038 VERIFY(IGMP_IFINFO(ifp) != NULL);
9039 } else {
9040 VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
9041 igmp_domifreattach(IGMP_IFINFO(ifp));
9042 }
9043 #endif /* INET */
9044 if (MLD_IFINFO(ifp) == NULL) {
9045 MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
9046 VERIFY(MLD_IFINFO(ifp) != NULL);
9047 } else {
9048 VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
9049 mld_domifreattach(MLD_IFINFO(ifp));
9050 }
9051
9052 VERIFY(ifp->if_data_threshold == 0);
9053 VERIFY(ifp->if_dt_tcall != NULL);
9054
9055 /*
9056 * Wait for the created kernel threads for I/O to get
9057 * scheduled and run at least once before we proceed
9058 * to mark interface as attached.
9059 */
9060 lck_mtx_lock(&ifp->if_ref_lock);
9061 while (ifp->if_threads_pending != 0) {
9062 DLIL_PRINTF("%s: Waiting for all kernel threads created for "
9063 "interface %s to get scheduled at least once.\n",
9064 __func__, ifp->if_xname);
9065 (void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
9066 __func__, NULL);
9067 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
9068 }
9069 lck_mtx_unlock(&ifp->if_ref_lock);
9070 DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
9071 "at least once. Proceeding.\n", __func__, ifp->if_xname);
9072
9073 /* Final mark this ifnet as attached. */
9074 ifnet_lock_exclusive(ifp);
9075 lck_mtx_lock_spin(&ifp->if_ref_lock);
9076 ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
9077 lck_mtx_unlock(&ifp->if_ref_lock);
9078 if (net_rtref) {
9079 /* boot-args override; enable idle notification */
9080 (void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
9081 IFRF_IDLE_NOTIFY);
9082 } else {
9083 /* apply previous request(s) to set the idle flags, if any */
9084 (void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
9085 ifp->if_idle_new_flags_mask);
9086 }
9087 #if SKYWALK
9088 /* the interface is fully attached; let the nexus adapter know */
9089 if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
9090 if (netif_compat) {
9091 if (sk_netif_compat_txmodel ==
9092 NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
9093 ifnet_enqueue_multi_setup(ifp,
9094 sk_tx_delay_qlen, sk_tx_delay_timeout);
9095 }
9096 ifp->if_nx_netif = nexus_netif;
9097 }
9098 ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
9099 }
9100 #endif /* SKYWALK */
9101 ifnet_lock_done(ifp);
9102 dlil_if_unlock();
9103
9104 #if PF
9105 /*
9106 * Attach packet filter to this interface, if enabled.
9107 */
9108 pf_ifnet_hook(ifp, 1);
9109 #endif /* PF */
9110
9111 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);
9112
9113 if (dlil_verbose) {
9114 DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
9115 (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
9116 }
9117
9118 return 0;
9119 }
9120
/*
 * Prepare the storage for the first/permanent link address, which must
 * have the same lifetime as the ifnet itself. Although the link
 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
 * its location in memory must never change as it may still be referred
 * to by some parts of the system afterwards (unfortunate implementation
 * artifacts inherited from BSD.)
 *
 * Caller must hold ifnet lock as writer.
 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;	/* address and netmask storage */
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Size the sockaddr_dl structures: the netmask covers only the
	 * interface name, while the address additionally needs room for
	 * the if_addrlen bytes of link-layer address.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
	/* round up to a multiple of sizeof (u_int32_t) */
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			/* ifaddr immediately followed by address + mask */
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure. This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	/* Populate the AF_LINK address */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		/*
		 * NOTE(review): the copy is clamped to sizeof(sdl_data)
		 * but sdl_nlen is set to the full namelen — presumably
		 * namelen never exceeds sdl_data here; verify for long
		 * interface names.
		 */
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* netmask: all-ones over the interface-name portion only */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* drop the reference on the address being replaced, if any */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9239
/*
 * Tell the network layers (IPv4, if configured, and IPv6) to drop all
 * of their addresses on this interface; invoked during final detach.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9248
/*
 * Initiate detach of an interface.
 *
 * Marks the interface down and DETACHING, unlinks it from ifnet_head
 * and ifindex2ifnet[] so it is no longer visible to lookups, resets
 * per-interface state, posts KEV_DL_IF_DETACHING, and finally hands
 * the interface to the detacher worker thread which performs the
 * heavyweight teardown (ifnet_detach_final) outside of this context
 * to avoid reentrancy.
 *
 * Returns EINVAL if ifp is NULL or not attached; ENXIO if a detach is
 * already in progress; 0 on success.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * NOTE(review): clearing cga_initialized presumably forces IPv6
	 * ND to regenerate its cryptographically generated address state
	 * on a future attach — confirm against nd6 code.
	 */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9441
/*
 * Append an ifnet to the list of interfaces awaiting final detach and
 * wake the detacher thread.  Caller must hold the dlil_if lock.
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	/* nudge the detacher thread out of its wait */
	wakeup((caddr_t)&ifnet_delayed_run);
}
9452
9453 static struct ifnet *
ifnet_detaching_dequeue(void)9454 ifnet_detaching_dequeue(void)
9455 {
9456 struct ifnet *ifp;
9457
9458 dlil_if_lock_assert();
9459
9460 ifp = TAILQ_FIRST(&ifnet_detaching_head);
9461 VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9462 if (ifp != NULL) {
9463 VERIFY(ifnet_detaching_cnt != 0);
9464 --ifnet_detaching_cnt;
9465 TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9466 ifp->if_detaching_link.tqe_next = NULL;
9467 ifp->if_detaching_link.tqe_prev = NULL;
9468 }
9469 return ifp;
9470 }
9471
/*
 * Continuation routine for the detacher thread.  Drains the queue of
 * interfaces awaiting final detach (populated by
 * ifnet_detaching_enqueue), invoking ifnet_detach_final() on each with
 * the dlil_if lock dropped, then re-arms the wait on ifnet_delayed_run
 * and blocks with itself as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		/* first run after thread creation; leave embryonic state */
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock; ifnet_detach_final() can sleep */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* queue drained; sleep until the next enqueue wakes us up */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9514
/*
 * Entry point for the interface detacher kernel thread.  Arms the wait
 * on ifnet_delayed_run, marks the thread embryonic, and issues one
 * self-wakeup so the continuation runs immediately and clears the
 * embryonic state (decrementing the pending-thread count there).  All
 * subsequent work happens in ifnet_detacher_thread_cont.  Never
 * returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9531
9532 static void
ifnet_detach_final(struct ifnet * ifp)9533 ifnet_detach_final(struct ifnet *ifp)
9534 {
9535 struct ifnet_filter *filter, *filter_next;
9536 struct dlil_ifnet *dlifp;
9537 struct ifnet_filter_head fhead;
9538 struct dlil_threading_info *inp;
9539 struct ifaddr *ifa;
9540 ifnet_detached_func if_free;
9541 int i;
9542
9543 #if SKYWALK
9544 dlil_netif_detach_notify(ifp);
9545 /*
9546 * Wait for the datapath to quiesce before tearing down
9547 * netif/flowswitch nexuses.
9548 */
9549 dlil_quiesce_and_detach_nexuses(ifp);
9550 #endif /* SKYWALK */
9551
9552 lck_mtx_lock(&ifp->if_ref_lock);
9553 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9554 panic("%s: flags mismatch (detaching not set) ifp=%p",
9555 __func__, ifp);
9556 /* NOTREACHED */
9557 }
9558
9559 /*
9560 * Wait until the existing IO references get released
9561 * before we proceed with ifnet_detach. This is not a
9562 * common case, so block without using a continuation.
9563 */
9564 while (ifp->if_refio > 0) {
9565 DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9566 "to be released\n", __func__, if_name(ifp));
9567 (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9568 (PZERO - 1), "ifnet_ioref_wait", NULL);
9569 }
9570
9571 VERIFY(ifp->if_datamov == 0);
9572 VERIFY(ifp->if_drainers == 0);
9573 VERIFY(ifp->if_suspend == 0);
9574 ifp->if_refflags &= ~IFRF_READY;
9575 lck_mtx_unlock(&ifp->if_ref_lock);
9576
9577 /* Clear agent IDs */
9578 if (ifp->if_agentids != NULL) {
9579 kfree_data(ifp->if_agentids,
9580 sizeof(uuid_t) * ifp->if_agentcount);
9581 ifp->if_agentids = NULL;
9582 }
9583 ifp->if_agentcount = 0;
9584
9585 #if SKYWALK
9586 VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9587 #endif /* SKYWALK */
9588 /* Drain and destroy send queue */
9589 ifclassq_teardown(ifp->if_snd);
9590
9591 /* Detach interface filters */
9592 lck_mtx_lock(&ifp->if_flt_lock);
9593 if_flt_monitor_enter(ifp);
9594
9595 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9596 fhead = ifp->if_flt_head;
9597 TAILQ_INIT(&ifp->if_flt_head);
9598
9599 for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9600 filter_next = TAILQ_NEXT(filter, filt_next);
9601 lck_mtx_unlock(&ifp->if_flt_lock);
9602
9603 dlil_detach_filter_internal(filter, 1);
9604 lck_mtx_lock(&ifp->if_flt_lock);
9605 }
9606 if_flt_monitor_leave(ifp);
9607 lck_mtx_unlock(&ifp->if_flt_lock);
9608
9609 /* Tell upper layers to drop their network addresses */
9610 if_purgeaddrs(ifp);
9611
9612 ifnet_lock_exclusive(ifp);
9613
9614 /* Unplumb all protocols */
9615 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9616 struct if_proto *proto;
9617
9618 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9619 while (proto != NULL) {
9620 protocol_family_t family = proto->protocol_family;
9621 ifnet_lock_done(ifp);
9622 proto_unplumb(family, ifp);
9623 ifnet_lock_exclusive(ifp);
9624 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9625 }
9626 /* There should not be any protocols left */
9627 VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9628 }
9629 zfree(dlif_phash_zone, ifp->if_proto_hash);
9630 ifp->if_proto_hash = NULL;
9631
9632 /* Detach (permanent) link address from if_addrhead */
9633 ifa = TAILQ_FIRST(&ifp->if_addrhead);
9634 VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9635 IFA_LOCK(ifa);
9636 if_detach_link_ifa(ifp, ifa);
9637 IFA_UNLOCK(ifa);
9638
9639 /* Remove (permanent) link address from ifnet_addrs[] */
9640 IFA_REMREF(ifa);
9641 ifnet_addrs[ifp->if_index - 1] = NULL;
9642
9643 /* This interface should not be on {ifnet_head,detaching} */
9644 VERIFY(ifp->if_link.tqe_next == NULL);
9645 VERIFY(ifp->if_link.tqe_prev == NULL);
9646 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9647 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9648 VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9649 VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9650
9651 /* The slot should have been emptied */
9652 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9653
9654 /* There should not be any addresses left */
9655 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9656
9657 /*
9658 * Signal the starter thread to terminate itself, and wait until
9659 * it has exited.
9660 */
9661 if (ifp->if_start_thread != THREAD_NULL) {
9662 lck_mtx_lock_spin(&ifp->if_start_lock);
9663 ifp->if_start_flags |= IFSF_TERMINATING;
9664 wakeup_one((caddr_t)&ifp->if_start_thread);
9665 lck_mtx_unlock(&ifp->if_start_lock);
9666
9667 /* wait for starter thread to terminate */
9668 lck_mtx_lock(&ifp->if_start_lock);
9669 while (ifp->if_start_thread != THREAD_NULL) {
9670 if (dlil_verbose) {
9671 DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
9672 __func__,
9673 if_name(ifp));
9674 }
9675 (void) msleep(&ifp->if_start_thread,
9676 &ifp->if_start_lock, (PZERO - 1),
9677 "ifnet_start_thread_exit", NULL);
9678 }
9679 lck_mtx_unlock(&ifp->if_start_lock);
9680 if (dlil_verbose) {
9681 DLIL_PRINTF("%s: %s starter thread termination complete",
9682 __func__, if_name(ifp));
9683 }
9684 }
9685
9686 /*
9687 * Signal the poller thread to terminate itself, and wait until
9688 * it has exited.
9689 */
9690 if (ifp->if_poll_thread != THREAD_NULL) {
9691 #if SKYWALK
9692 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9693 #endif /* SKYWALK */
9694 lck_mtx_lock_spin(&ifp->if_poll_lock);
9695 ifp->if_poll_flags |= IF_POLLF_TERMINATING;
9696 wakeup_one((caddr_t)&ifp->if_poll_thread);
9697 lck_mtx_unlock(&ifp->if_poll_lock);
9698
9699 /* wait for poller thread to terminate */
9700 lck_mtx_lock(&ifp->if_poll_lock);
9701 while (ifp->if_poll_thread != THREAD_NULL) {
9702 if (dlil_verbose) {
9703 DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
9704 __func__,
9705 if_name(ifp));
9706 }
9707 (void) msleep(&ifp->if_poll_thread,
9708 &ifp->if_poll_lock, (PZERO - 1),
9709 "ifnet_poll_thread_exit", NULL);
9710 }
9711 lck_mtx_unlock(&ifp->if_poll_lock);
9712 if (dlil_verbose) {
9713 DLIL_PRINTF("%s: %s poller thread termination complete\n",
9714 __func__, if_name(ifp));
9715 }
9716 }
9717
9718 /*
9719 * If thread affinity was set for the workloop thread, we will need
9720 * to tear down the affinity and release the extra reference count
9721 * taken at attach time. Does not apply to lo0 or other interfaces
9722 * without dedicated input threads.
9723 */
9724 if ((inp = ifp->if_inp) != NULL) {
9725 VERIFY(inp != dlil_main_input_thread);
9726
9727 if (inp->dlth_affinity) {
9728 struct thread *tp, *wtp, *ptp;
9729
9730 lck_mtx_lock_spin(&inp->dlth_lock);
9731 wtp = inp->dlth_driver_thread;
9732 inp->dlth_driver_thread = THREAD_NULL;
9733 ptp = inp->dlth_poller_thread;
9734 inp->dlth_poller_thread = THREAD_NULL;
9735 ASSERT(inp->dlth_thread != THREAD_NULL);
9736 tp = inp->dlth_thread; /* don't nullify now */
9737 inp->dlth_affinity_tag = 0;
9738 inp->dlth_affinity = FALSE;
9739 lck_mtx_unlock(&inp->dlth_lock);
9740
9741 /* Tear down poll thread affinity */
9742 if (ptp != NULL) {
9743 VERIFY(ifp->if_eflags & IFEF_RXPOLL);
9744 VERIFY(ifp->if_xflags & IFXF_LEGACY);
9745 (void) dlil_affinity_set(ptp,
9746 THREAD_AFFINITY_TAG_NULL);
9747 thread_deallocate(ptp);
9748 }
9749
9750 /* Tear down workloop thread affinity */
9751 if (wtp != NULL) {
9752 (void) dlil_affinity_set(wtp,
9753 THREAD_AFFINITY_TAG_NULL);
9754 thread_deallocate(wtp);
9755 }
9756
9757 /* Tear down DLIL input thread affinity */
9758 (void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
9759 thread_deallocate(tp);
9760 }
9761
9762 /* disassociate ifp DLIL input thread */
9763 ifp->if_inp = NULL;
9764
9765 /* if the worker thread was created, tell it to terminate */
9766 if (inp->dlth_thread != THREAD_NULL) {
9767 lck_mtx_lock_spin(&inp->dlth_lock);
9768 inp->dlth_flags |= DLIL_INPUT_TERMINATE;
9769 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
9770 wakeup_one((caddr_t)&inp->dlth_flags);
9771 }
9772 lck_mtx_unlock(&inp->dlth_lock);
9773 ifnet_lock_done(ifp);
9774
9775 /* wait for the input thread to terminate */
9776 lck_mtx_lock_spin(&inp->dlth_lock);
9777 while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
9778 == 0) {
9779 (void) msleep(&inp->dlth_flags, &inp->dlth_lock,
9780 (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
9781 }
9782 lck_mtx_unlock(&inp->dlth_lock);
9783 ifnet_lock_exclusive(ifp);
9784 }
9785
9786 /* clean-up input thread state */
9787 dlil_clean_threading_info(inp);
9788 /* clean-up poll parameters */
9789 VERIFY(ifp->if_poll_thread == THREAD_NULL);
9790 dlil_reset_rxpoll_params(ifp);
9791 }
9792
9793 /* The driver might unload, so point these to ourselves */
9794 if_free = ifp->if_free;
9795 ifp->if_output_dlil = ifp_if_output;
9796 ifp->if_output = ifp_if_output;
9797 ifp->if_pre_enqueue = ifp_if_output;
9798 ifp->if_start = ifp_if_start;
9799 ifp->if_output_ctl = ifp_if_ctl;
9800 ifp->if_input_dlil = ifp_if_input;
9801 ifp->if_input_poll = ifp_if_input_poll;
9802 ifp->if_input_ctl = ifp_if_ctl;
9803 ifp->if_ioctl = ifp_if_ioctl;
9804 ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
9805 ifp->if_free = ifp_if_free;
9806 ifp->if_demux = ifp_if_demux;
9807 ifp->if_event = ifp_if_event;
9808 ifp->if_framer_legacy = ifp_if_framer;
9809 ifp->if_framer = ifp_if_framer_extended;
9810 ifp->if_add_proto = ifp_if_add_proto;
9811 ifp->if_del_proto = ifp_if_del_proto;
9812 ifp->if_check_multi = ifp_if_check_multi;
9813
9814 /* wipe out interface description */
9815 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9816 ifp->if_desc.ifd_len = 0;
9817 VERIFY(ifp->if_desc.ifd_desc != NULL);
9818 bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
9819
9820 /* there shouldn't be any delegation by now */
9821 VERIFY(ifp->if_delegated.ifp == NULL);
9822 VERIFY(ifp->if_delegated.type == 0);
9823 VERIFY(ifp->if_delegated.family == 0);
9824 VERIFY(ifp->if_delegated.subfamily == 0);
9825 VERIFY(ifp->if_delegated.expensive == 0);
9826 VERIFY(ifp->if_delegated.constrained == 0);
9827
9828 /* QoS marking get cleared */
9829 if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9830 if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
9831
9832 #if SKYWALK
9833 /* the nexus destructor is responsible for clearing these */
9834 VERIFY(ifp->if_na_ops == NULL);
9835 VERIFY(ifp->if_na == NULL);
9836 #endif /* SKYWALK */
9837
9838 /* promiscuous count needs to start at zero again */
9839 ifp->if_pcount = 0;
9840 ifp->if_flags &= ~IFF_PROMISC;
9841
9842 ifnet_lock_done(ifp);
9843
9844 #if PF
9845 /*
9846 * Detach this interface from packet filter, if enabled.
9847 */
9848 pf_ifnet_hook(ifp, 0);
9849 #endif /* PF */
9850
9851 /* Filter list should be empty */
9852 lck_mtx_lock_spin(&ifp->if_flt_lock);
9853 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9854 VERIFY(ifp->if_flt_busy == 0);
9855 VERIFY(ifp->if_flt_waiters == 0);
9856 VERIFY(ifp->if_flt_non_os_count == 0);
9857 VERIFY(ifp->if_flt_no_tso_count == 0);
9858 lck_mtx_unlock(&ifp->if_flt_lock);
9859
9860 /* Last chance to drain send queue */
9861 if_qflush_snd(ifp, 0);
9862
9863 /* Last chance to cleanup any cached route */
9864 lck_mtx_lock(&ifp->if_cached_route_lock);
9865 VERIFY(!ifp->if_fwd_cacheok);
9866 ROUTE_RELEASE(&ifp->if_fwd_route);
9867 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9868 ROUTE_RELEASE(&ifp->if_src_route);
9869 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9870 ROUTE_RELEASE(&ifp->if_src_route6);
9871 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9872 lck_mtx_unlock(&ifp->if_cached_route_lock);
9873
9874 VERIFY(ifp->if_data_threshold == 0);
9875 VERIFY(ifp->if_dt_tcall != NULL);
9876 VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
9877
9878 ifnet_llreach_ifdetach(ifp);
9879
9880 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
9881
9882 /*
9883 * Finally, mark this ifnet as detached.
9884 */
9885 if (dlil_verbose) {
9886 DLIL_PRINTF("%s: detached\n", if_name(ifp));
9887 }
9888 lck_mtx_lock_spin(&ifp->if_ref_lock);
9889 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9890 panic("%s: flags mismatch (detaching not set) ifp=%p",
9891 __func__, ifp);
9892 /* NOTREACHED */
9893 }
9894 ifp->if_refflags &= ~IFRF_DETACHING;
9895 lck_mtx_unlock(&ifp->if_ref_lock);
9896 if (if_free != NULL) {
9897 if_free(ifp);
9898 }
9899
9900 ifclassq_release(&ifp->if_snd);
9901
9902 /* we're fully detached, clear the "in use" bit */
9903 dlifp = (struct dlil_ifnet *)ifp;
9904 lck_mtx_lock(&dlifp->dl_if_lock);
9905 ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
9906 dlifp->dl_if_flags &= ~DLIF_INUSE;
9907 lck_mtx_unlock(&dlifp->dl_if_lock);
9908
9909 /* Release reference held during ifnet attach */
9910 ifnet_release(ifp);
9911 }
9912
9913 errno_t
ifp_if_output(struct ifnet * ifp,struct mbuf * m)9914 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
9915 {
9916 #pragma unused(ifp)
9917 m_freem_list(m);
9918 return 0;
9919 }
9920
/*
 * Stub start routine installed on a detaching interface: purge any
 * packets still queued for transmit.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9926
9927 static errno_t
ifp_if_input(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)9928 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
9929 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
9930 boolean_t poll, struct thread *tp)
9931 {
9932 #pragma unused(ifp, m_tail, s, poll, tp)
9933 m_freem_list(m_head);
9934 return ENXIO;
9935 }
9936
9937 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9938 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9939 struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9940 {
9941 #pragma unused(ifp, flags, max_cnt)
9942 if (m_head != NULL) {
9943 *m_head = NULL;
9944 }
9945 if (m_tail != NULL) {
9946 *m_tail = NULL;
9947 }
9948 if (cnt != NULL) {
9949 *cnt = 0;
9950 }
9951 if (len != NULL) {
9952 *len = 0;
9953 }
9954 }
9955
9956 static errno_t
ifp_if_ctl(struct ifnet * ifp,ifnet_ctl_cmd_t cmd,u_int32_t arglen,void * arg)9957 ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
9958 {
9959 #pragma unused(ifp, cmd, arglen, arg)
9960 return EOPNOTSUPP;
9961 }
9962
9963 static errno_t
ifp_if_demux(struct ifnet * ifp,struct mbuf * m,char * fh,protocol_family_t * pf)9964 ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
9965 {
9966 #pragma unused(ifp, fh, pf)
9967 m_freem(m);
9968 return EJUSTRETURN;
9969 }
9970
9971 static errno_t
ifp_if_add_proto(struct ifnet * ifp,protocol_family_t pf,const struct ifnet_demux_desc * da,u_int32_t dc)9972 ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
9973 const struct ifnet_demux_desc *da, u_int32_t dc)
9974 {
9975 #pragma unused(ifp, pf, da, dc)
9976 return EINVAL;
9977 }
9978
9979 static errno_t
ifp_if_del_proto(struct ifnet * ifp,protocol_family_t pf)9980 ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
9981 {
9982 #pragma unused(ifp, pf)
9983 return EINVAL;
9984 }
9985
9986 static errno_t
ifp_if_check_multi(struct ifnet * ifp,const struct sockaddr * sa)9987 ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
9988 {
9989 #pragma unused(ifp, sa)
9990 return EOPNOTSUPP;
9991 }
9992
/*
 * Stub legacy framer installed on a detached interface.  The legacy
 * framer signature differs per platform: on !XNU_TARGET_OS_OSX builds
 * it also carries prepend/append length out-parameters.  Both variants
 * simply forward to ifp_if_framer_extended().
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	/* no pre/post out-parameters on this platform */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10011
10012 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10013 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10014 const struct sockaddr *sa, const char *ll, const char *t,
10015 u_int32_t *pre, u_int32_t *post)
10016 {
10017 #pragma unused(ifp, sa, ll, t)
10018 m_freem(*m);
10019 *m = NULL;
10020
10021 if (pre != NULL) {
10022 *pre = 0;
10023 }
10024 if (post != NULL) {
10025 *post = 0;
10026 }
10027
10028 return EJUSTRETURN;
10029 }
10030
10031 errno_t
ifp_if_ioctl(struct ifnet * ifp,unsigned long cmd,void * arg)10032 ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
10033 {
10034 #pragma unused(ifp, cmd, arg)
10035 return EOPNOTSUPP;
10036 }
10037
10038 static errno_t
ifp_if_set_bpf_tap(struct ifnet * ifp,bpf_tap_mode tm,bpf_packet_func f)10039 ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
10040 {
10041 #pragma unused(ifp, tm, f)
10042 /* XXX not sure what to do here */
10043 return 0;
10044 }
10045
/* Stub free callback installed on a detached interface: nothing to do. */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10051
/* Stub event callback installed on a detached interface: drop events. */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10057
/*
 * dlil_if_acquire: find or allocate a dlil_ifnet for the given family.
 *
 * On success returns 0 with *ifp set and a dlil reference held.
 * Returns EBUSY if an in-use interface with the same extended name (or
 * same unique id) already exists, or ENOMEM if the unique-id copy
 * cannot be allocated.  A not-in-use entry whose unique id matches is
 * recycled (marked DLIF_INUSE|DLIF_REUSE); otherwise a fresh, 64-bit
 * aligned object is carved out of dlif_zone.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;    /* first recyclable match */
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		/* re-check under the entry lock: state may have changed */
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			/* free the original (unaligned) allocation */
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* point name storage at the buffers embedded in the dlil_ifnet */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	/* initialize all of the per-ifnet locks */
	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* sanity: the carved-out object must honor 64-bit alignment */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10235
/*
 * Common guts of releasing a dlil_ifnet: drop the alloc statistics,
 * free any out-of-line broadcast address, restore name/xname to the
 * embedded storage buffers, and — when clear_in_use is true — clear
 * the DLIF_INUSE bit so the entry can be recycled.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* broadcast address was heap-allocated only if it didn't fit inline */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10266
/* Release a dlil_ifnet without clearing its in-use bit. */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10272
/* Acquire the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10278
/* Release the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10284
/* Assert that the current thread owns the dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10290
/*
 * Unplumb the IPv4 and IPv6 protocol attachments from an interface.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10306
/*
 * Copy the interface's cached IPv4 source route into *dst.
 * The spin lock is converted to a full mutex before the copy.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10317
/*
 * Store *src back into the interface's cached IPv4 source route.
 * If forwarding-cache use is disabled (if_fwd_cacheok is false), the
 * route is simply released instead of being cached.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10331
/*
 * Copy the interface's cached IPv6 source route into *dst.
 * IPv6 counterpart of ifp_src_route_copyout().
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10343
/*
 * Store *src back into the interface's cached IPv6 source route, or
 * release it when forwarding-cache use is disabled.
 * IPv6 counterpart of ifp_src_route_copyin().
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10358
/*
 * Look up (and cache) a scoped IPv4 route for src_ip on this interface.
 * Returns the rtentry with a reference held for the caller, or NULL if
 * the lookup fails.  The cache hit path returns the copied-out route;
 * on a miss, a fresh scoped lookup replaces the cached route.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route src_rt;
	struct sockaddr_in *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	/* snapshot the cached route */
	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			/* (re)initialize the sockaddr for an AF_INET lookup */
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		/* lookup scoped to this interface's index */
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10393
/*
 * Look up (and cache) a scoped IPv6 route for src_ip6 on this
 * interface; IPv6 counterpart of ifnet_cached_rtlookup_inet().
 * Returns the rtentry with a reference held for the caller, or NULL.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	/* snapshot the cached route */
	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			/* (re)initialize the sockaddr for an AF_INET6 lookup */
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/*
		 * NOTE(review): ROUTE_RELEASE above leaves ro_rt cleared,
		 * so this condition appears to always hold here (the inet
		 * variant uses VERIFY instead) — confirm before changing.
		 */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10430
/*
 * if_lqm_update: normalize a raw link quality metric to one of the
 * threshold edges, store it in the interface state, and post a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED kernel event.
 *
 * 'locked' means the caller already holds the ifnet lock exclusively.
 * The lock is always dropped while the event is posted; it is
 * reacquired before returning only for callers that held it.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* abort-level quality: schedule inpcb handling promptly */
		atomic_bitset_32(&tcbinfo.ipi_flags,
		    INPCBINFO_HANDLE_LQM_ABORT);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return; /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10495
/*
 * if_rrc_state_update: record the radio resource control state and post
 * KEV_DL_RRC_STATE_CHANGED when it changes.
 *
 * Called with the ifnet lock held exclusively; the lock is dropped
 * around the kernel event post and reacquired before returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* no-op when the state is unchanged and already marked valid */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10525
10526 errno_t
if_state_update(struct ifnet * ifp,struct if_interface_state * if_interface_state)10527 if_state_update(struct ifnet *ifp,
10528 struct if_interface_state *if_interface_state)
10529 {
10530 u_short if_index_available = 0;
10531
10532 ifnet_lock_exclusive(ifp);
10533
10534 if ((ifp->if_type != IFT_CELLULAR) &&
10535 (if_interface_state->valid_bitmask &
10536 IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10537 ifnet_lock_done(ifp);
10538 return ENOTSUP;
10539 }
10540 if ((if_interface_state->valid_bitmask &
10541 IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10542 (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10543 if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10544 ifnet_lock_done(ifp);
10545 return EINVAL;
10546 }
10547 if ((if_interface_state->valid_bitmask &
10548 IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10549 if_interface_state->rrc_state !=
10550 IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10551 if_interface_state->rrc_state !=
10552 IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10553 ifnet_lock_done(ifp);
10554 return EINVAL;
10555 }
10556
10557 if (if_interface_state->valid_bitmask &
10558 IF_INTERFACE_STATE_LQM_STATE_VALID) {
10559 if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10560 }
10561 if (if_interface_state->valid_bitmask &
10562 IF_INTERFACE_STATE_RRC_STATE_VALID) {
10563 if_rrc_state_update(ifp, if_interface_state->rrc_state);
10564 }
10565 if (if_interface_state->valid_bitmask &
10566 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10567 ifp->if_interface_state.valid_bitmask |=
10568 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10569 ifp->if_interface_state.interface_availability =
10570 if_interface_state->interface_availability;
10571
10572 if (ifp->if_interface_state.interface_availability ==
10573 IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10574 os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10575 __func__, if_name(ifp), ifp->if_index);
10576 if_index_available = ifp->if_index;
10577 } else {
10578 os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
10579 __func__, if_name(ifp), ifp->if_index);
10580 }
10581 }
10582 ifnet_lock_done(ifp);
10583
10584 /*
10585 * Check if the TCP connections going on this interface should be
10586 * forced to send probe packets instead of waiting for TCP timers
10587 * to fire. This is done on an explicit notification such as
10588 * SIOCSIFINTERFACESTATE which marks the interface as available.
10589 */
10590 if (if_index_available > 0) {
10591 tcp_interface_send_probe(if_index_available);
10592 }
10593
10594 return 0;
10595 }
10596
10597 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10598 if_get_state(struct ifnet *ifp,
10599 struct if_interface_state *if_interface_state)
10600 {
10601 ifnet_lock_shared(ifp);
10602
10603 if_interface_state->valid_bitmask = 0;
10604
10605 if (ifp->if_interface_state.valid_bitmask &
10606 IF_INTERFACE_STATE_RRC_STATE_VALID) {
10607 if_interface_state->valid_bitmask |=
10608 IF_INTERFACE_STATE_RRC_STATE_VALID;
10609 if_interface_state->rrc_state =
10610 ifp->if_interface_state.rrc_state;
10611 }
10612 if (ifp->if_interface_state.valid_bitmask &
10613 IF_INTERFACE_STATE_LQM_STATE_VALID) {
10614 if_interface_state->valid_bitmask |=
10615 IF_INTERFACE_STATE_LQM_STATE_VALID;
10616 if_interface_state->lqm_state =
10617 ifp->if_interface_state.lqm_state;
10618 }
10619 if (ifp->if_interface_state.valid_bitmask &
10620 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10621 if_interface_state->valid_bitmask |=
10622 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10623 if_interface_state->interface_availability =
10624 ifp->if_interface_state.interface_availability;
10625 }
10626
10627 ifnet_lock_done(ifp);
10628 }
10629
10630 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10631 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10632 {
10633 if (conn_probe > 1) {
10634 return EINVAL;
10635 }
10636 if (conn_probe == 0) {
10637 if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10638 } else {
10639 if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10640 }
10641
10642 #if NECP
10643 necp_update_all_clients();
10644 #endif /* NECP */
10645
10646 tcp_probe_connectivity(ifp, conn_probe);
10647 return 0;
10648 }
10649
/* for uuid.c */
/*
 * get_ether_index: return the if_index of en0 if it exists; otherwise
 * return 0 and set *ret_other_index to the lowest-unit en* interface,
 * or failing that, any IFT_ETHER interface (0 if none).
 */
static int
get_ether_index(int * ret_other_index)
{
	struct ifnet *ifp;
	int en0_index = 0;
	int other_en_index = 0;      /* lowest-unit en* other than en0 */
	int any_ether_index = 0;     /* fallback: first IFT_ETHER seen */
	short best_unit = 0;

	*ret_other_index = 0;
	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
		/*
		 * find en0, or if not en0, the lowest unit en*, and if not
		 * that, any ethernet
		 */
		ifnet_lock_shared(ifp);
		if (strcmp(ifp->if_name, "en") == 0) {
			if (ifp->if_unit == 0) {
				/* found en0, we're done */
				en0_index = ifp->if_index;
				ifnet_lock_done(ifp);
				break;
			}
			if (other_en_index == 0 || ifp->if_unit < best_unit) {
				other_en_index = ifp->if_index;
				best_unit = ifp->if_unit;
			}
		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
			any_ether_index = ifp->if_index;
		}
		ifnet_lock_done(ifp);
	}
	if (en0_index == 0) {
		/* no en0: report the best alternative to the caller */
		if (other_en_index != 0) {
			*ret_other_index = other_en_index;
		} else if (any_ether_index != 0) {
			*ret_other_index = any_ether_index;
		}
	}
	return en0_index;
}
10692
/*
 * uuid_get_ethernet: copy an ethernet MAC address (for UUID node
 * generation) into 'node'.  Prefers en0, then another en*, then any
 * ethernet; within that interface, prefers the permanent (burned-in)
 * address when one was recorded.  Returns 0 on success, -1 if no
 * suitable interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	/* cached across calls; revalidated against ifindex2ifnet below */
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10734
10735 static int
10736 sysctl_rxpoll SYSCTL_HANDLER_ARGS
10737 {
10738 #pragma unused(arg1, arg2)
10739 uint32_t i;
10740 int err;
10741
10742 i = if_rxpoll;
10743
10744 err = sysctl_handle_int(oidp, &i, 0, req);
10745 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10746 return err;
10747 }
10748
10749 if (net_rxpoll == 0) {
10750 return ENXIO;
10751 }
10752
10753 if_rxpoll = i;
10754 return err;
10755 }
10756
10757 static int
10758 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
10759 {
10760 #pragma unused(arg1, arg2)
10761 uint64_t q;
10762 int err;
10763
10764 q = if_rxpoll_mode_holdtime;
10765
10766 err = sysctl_handle_quad(oidp, &q, 0, req);
10767 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10768 return err;
10769 }
10770
10771 if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
10772 q = IF_RXPOLL_MODE_HOLDTIME_MIN;
10773 }
10774
10775 if_rxpoll_mode_holdtime = q;
10776
10777 return err;
10778 }
10779
10780 static int
10781 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
10782 {
10783 #pragma unused(arg1, arg2)
10784 uint64_t q;
10785 int err;
10786
10787 q = if_rxpoll_sample_holdtime;
10788
10789 err = sysctl_handle_quad(oidp, &q, 0, req);
10790 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10791 return err;
10792 }
10793
10794 if (q < IF_RXPOLL_SAMPLETIME_MIN) {
10795 q = IF_RXPOLL_SAMPLETIME_MIN;
10796 }
10797
10798 if_rxpoll_sample_holdtime = q;
10799
10800 return err;
10801 }
10802
10803 static int
10804 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
10805 {
10806 #pragma unused(arg1, arg2)
10807 uint64_t q;
10808 int err;
10809
10810 q = if_rxpoll_interval_time;
10811
10812 err = sysctl_handle_quad(oidp, &q, 0, req);
10813 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10814 return err;
10815 }
10816
10817 if (q < IF_RXPOLL_INTERVALTIME_MIN) {
10818 q = IF_RXPOLL_INTERVALTIME_MIN;
10819 }
10820
10821 if_rxpoll_interval_time = q;
10822
10823 return err;
10824 }
10825
10826 static int
10827 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
10828 {
10829 #pragma unused(arg1, arg2)
10830 uint32_t i;
10831 int err;
10832
10833 i = if_sysctl_rxpoll_wlowat;
10834
10835 err = sysctl_handle_int(oidp, &i, 0, req);
10836 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10837 return err;
10838 }
10839
10840 if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
10841 return EINVAL;
10842 }
10843
10844 if_sysctl_rxpoll_wlowat = i;
10845 return err;
10846 }
10847
10848 static int
10849 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
10850 {
10851 #pragma unused(arg1, arg2)
10852 uint32_t i;
10853 int err;
10854
10855 i = if_sysctl_rxpoll_whiwat;
10856
10857 err = sysctl_handle_int(oidp, &i, 0, req);
10858 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10859 return err;
10860 }
10861
10862 if (i <= if_sysctl_rxpoll_wlowat) {
10863 return EINVAL;
10864 }
10865
10866 if_sysctl_rxpoll_whiwat = i;
10867 return err;
10868 }
10869
10870 static int
10871 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
10872 {
10873 #pragma unused(arg1, arg2)
10874 int i, err;
10875
10876 i = if_sndq_maxlen;
10877
10878 err = sysctl_handle_int(oidp, &i, 0, req);
10879 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10880 return err;
10881 }
10882
10883 if (i < IF_SNDQ_MINLEN) {
10884 i = IF_SNDQ_MINLEN;
10885 }
10886
10887 if_sndq_maxlen = i;
10888 return err;
10889 }
10890
10891 static int
10892 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
10893 {
10894 #pragma unused(arg1, arg2)
10895 int i, err;
10896
10897 i = if_rcvq_maxlen;
10898
10899 err = sysctl_handle_int(oidp, &i, 0, req);
10900 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10901 return err;
10902 }
10903
10904 if (i < IF_RCVQ_MINLEN) {
10905 i = IF_RCVQ_MINLEN;
10906 }
10907
10908 if_rcvq_maxlen = i;
10909 return err;
10910 }
10911
10912 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10913 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10914 int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10915 {
10916 struct kev_dl_node_presence kev;
10917 struct sockaddr_dl *sdl;
10918 struct sockaddr_in6 *sin6;
10919 int ret = 0;
10920
10921 VERIFY(ifp);
10922 VERIFY(sa);
10923 VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10924
10925 bzero(&kev, sizeof(kev));
10926 sin6 = &kev.sin6_node_address;
10927 sdl = &kev.sdl_node_address;
10928 nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10929 kev.rssi = rssi;
10930 kev.link_quality_metric = lqm;
10931 kev.node_proximity_metric = npm;
10932 bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10933
10934 ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10935 if (ret == 0 || ret == EEXIST) {
10936 int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10937 &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10938 if (err != 0) {
10939 log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10940 "error %d\n", __func__, err);
10941 }
10942 }
10943
10944 if (ret == EEXIST) {
10945 ret = 0;
10946 }
10947 return ret;
10948 }
10949
/*
 * Report that a neighbor node has disappeared from "ifp": remove it
 * from the ND6 neighbor state and, on success, post a
 * KEV_DL_NODE_ABSENCE kernel event identifying the node.
 *
 * "sa" may be either the node's AF_INET6 address or its AF_LINK
 * address; the other form is derived (from the neighbor cache, or by
 * decomposing the link address, respectively).
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		/* fills kev_sdl from the neighbor-cache entry being removed */
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* stamp the event with this interface's type/index */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
10990
/*
 * Variant of dlil_node_present() where the caller supplies the node's
 * IPv6 address ("sa", AF_INET6) and link-layer address ("sdl",
 * AF_LINK) separately instead of a single decomposable sockaddr.
 * Updates ND6 neighbor state and posts KEV_DL_NODE_PRESENCE.
 *
 * Returns 0 on success (EEXIST — node already known — is folded into
 * success), else the error from nd6_alt_node_present().
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	/* copy the link-layer address, stamped with this ifp's type/index */
	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		/* post as an update (TRUE) when the node already existed */
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	/* an already-present node is not an error to the caller */
	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11034
11035 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11036 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11037 kauth_cred_t *credp)
11038 {
11039 const u_int8_t *bytes;
11040 size_t size;
11041
11042 bytes = CONST_LLADDR(sdl);
11043 size = sdl->sdl_alen;
11044
11045 #if CONFIG_MACF
11046 if (dlil_lladdr_ckreq) {
11047 switch (sdl->sdl_type) {
11048 case IFT_ETHER:
11049 case IFT_IEEE1394:
11050 break;
11051 default:
11052 credp = NULL;
11053 break;
11054 }
11055 ;
11056
11057 if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11058 static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11059 [0] = 2
11060 };
11061
11062 bytes = unspec;
11063 }
11064 }
11065 #else
11066 #pragma unused(credp)
11067 #endif
11068
11069 if (sizep != NULL) {
11070 *sizep = size;
11071 }
11072 return bytes;
11073 }
11074
11075 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])11076 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11077 u_int8_t info[DLIL_MODARGLEN])
11078 {
11079 struct kev_dl_issues kev;
11080 struct timeval tv;
11081
11082 VERIFY(ifp != NULL);
11083 VERIFY(modid != NULL);
11084 _CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11085 _CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11086
11087 bzero(&kev, sizeof(kev));
11088
11089 microtime(&tv);
11090 kev.timestamp = tv.tv_sec;
11091 bcopy(modid, &kev.modid, DLIL_MODIDLEN);
11092 if (info != NULL) {
11093 bcopy(info, &kev.info, DLIL_MODARGLEN);
11094 }
11095
11096 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
11097 &kev.link_data, sizeof(kev), FALSE);
11098 }
11099
/*
 * Handle SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC ioctls: set or
 * get the interface's opportunistic throttling state, and report the
 * number of opportunistic TCP/UDP connections currently using it via
 * ifr->ifr_opportunistic.ifo_inuse.
 *
 * Setting requires superuser.  EALREADY from the lower layers (state
 * already as requested) is folded into success.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* map the ioctl flag onto a throttle level */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		/* get: translate the throttle level back into the flag */
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* "already in that state" is success from the caller's view */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11158
11159 int
ifnet_get_throttle(struct ifnet * ifp,u_int32_t * level)11160 ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
11161 {
11162 struct ifclassq *ifq;
11163 int err = 0;
11164
11165 if (!(ifp->if_eflags & IFEF_TXSTART)) {
11166 return ENXIO;
11167 }
11168
11169 *level = IFNET_THROTTLE_OFF;
11170
11171 ifq = ifp->if_snd;
11172 IFCQ_LOCK(ifq);
11173 /* Throttling works only for IFCQ, not ALTQ instances */
11174 if (IFCQ_IS_ENABLED(ifq)) {
11175 cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
11176
11177 err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11178 *level = req.level;
11179 }
11180 IFCQ_UNLOCK(ifq);
11181
11182 return err;
11183 }
11184
11185 int
ifnet_set_throttle(struct ifnet * ifp,u_int32_t level)11186 ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
11187 {
11188 struct ifclassq *ifq;
11189 int err = 0;
11190
11191 if (!(ifp->if_eflags & IFEF_TXSTART)) {
11192 return ENXIO;
11193 }
11194
11195 ifq = ifp->if_snd;
11196
11197 switch (level) {
11198 case IFNET_THROTTLE_OFF:
11199 case IFNET_THROTTLE_OPPORTUNISTIC:
11200 break;
11201 default:
11202 return EINVAL;
11203 }
11204
11205 IFCQ_LOCK(ifq);
11206 if (IFCQ_IS_ENABLED(ifq)) {
11207 cqrq_throttle_t req = { 1, level };
11208
11209 err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11210 }
11211 IFCQ_UNLOCK(ifq);
11212
11213 if (err == 0) {
11214 DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
11215 level);
11216 #if NECP
11217 necp_update_all_clients();
11218 #endif /* NECP */
11219 if (level == IFNET_THROTTLE_OFF) {
11220 ifnet_start(ifp);
11221 }
11222 }
11223
11224 return err;
11225 }
11226
/*
 * Handle SIOCSIFLOG / SIOCGIFLOG ioctls: set or get the interface's
 * logging level, facility flags, category and subcategory.  Setting
 * requires the PRIV_NET_INTERFACE_CONTROL privilege.
 */
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		/* validate level and flags before applying anything */
		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* note: flags is masked in place; at least one must be set */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			/* copy the current settings back to userland */
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11274
/*
 * Apply a logging level/flags/category/subcategory to "ifp".  The
 * request is merged with the interface's current facility flags and,
 * when the driver registered an output-control callback, forwarded to
 * the lower layers (minus the DLIL-only facility bit).  Callers must
 * pass a validated level and a non-empty masked flag set (see
 * ifnet_getset_log()).
 *
 * Returns 0 on success or the driver callback's error.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* the DLIL facility is handled here, not by the driver */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			/* nothing left to log here either: reset the level */
			level = 0;
		}
	}

	if (err == 0) {
		/* IFNET_LOG_DEFAULT clears all facility flags */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11333
11334 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11335 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11336 int32_t *category, int32_t *subcategory)
11337 {
11338 if (level != NULL) {
11339 *level = ifp->if_log.level;
11340 }
11341 if (flags != NULL) {
11342 *flags = ifp->if_log.flags;
11343 }
11344 if (category != NULL) {
11345 *category = ifp->if_log.category;
11346 }
11347 if (subcategory != NULL) {
11348 *subcategory = ifp->if_log.subcategory;
11349 }
11350
11351 return 0;
11352 }
11353
11354 int
ifnet_notify_address(struct ifnet * ifp,int af)11355 ifnet_notify_address(struct ifnet *ifp, int af)
11356 {
11357 struct ifnet_notify_address_params na;
11358
11359 #if PF
11360 (void) pf_ifaddr_hook(ifp);
11361 #endif /* PF */
11362
11363 if (ifp->if_output_ctl == NULL) {
11364 return EOPNOTSUPP;
11365 }
11366
11367 bzero(&na, sizeof(na));
11368 na.address_family = (sa_family_t)af;
11369
11370 return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11371 sizeof(na), &na);
11372 }
11373
11374 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11375 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11376 {
11377 if (ifp == NULL || flowid == NULL) {
11378 return EINVAL;
11379 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11380 !IF_FULLY_ATTACHED(ifp)) {
11381 return ENXIO;
11382 }
11383
11384 *flowid = ifp->if_flowhash;
11385
11386 return 0;
11387 }
11388
11389 errno_t
ifnet_disable_output(struct ifnet * ifp)11390 ifnet_disable_output(struct ifnet *ifp)
11391 {
11392 int err;
11393
11394 if (ifp == NULL) {
11395 return EINVAL;
11396 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11397 !IF_FULLY_ATTACHED(ifp)) {
11398 return ENXIO;
11399 }
11400
11401 if ((err = ifnet_fc_add(ifp)) == 0) {
11402 lck_mtx_lock_spin(&ifp->if_start_lock);
11403 ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11404 lck_mtx_unlock(&ifp->if_start_lock);
11405 }
11406 return err;
11407 }
11408
11409 errno_t
ifnet_enable_output(struct ifnet * ifp)11410 ifnet_enable_output(struct ifnet *ifp)
11411 {
11412 if (ifp == NULL) {
11413 return EINVAL;
11414 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11415 !IF_FULLY_ATTACHED(ifp)) {
11416 return ENXIO;
11417 }
11418
11419 ifnet_start_common(ifp, TRUE);
11420 return 0;
11421 }
11422
/*
 * Flow-advisory callback: a queue identified by "flowhash" has drained
 * enough that the matching flow-controlled interface (if any) can
 * resume output.  Looks up and removes the flow-control entry, then
 * re-enables output on its interface.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	/* ifnet_fc_get() removes the entry from the tree on success */
	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* drop the I/O reference taken by ifnet_is_attached() */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11446
11447 /*
11448 * Function to compare ifnet_fc_entries in ifnet flow control tree
11449 */
11450 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11451 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11452 {
11453 return fc1->ifce_flowhash - fc2->ifce_flowhash;
11454 }
11455
/*
 * Register "ifp" in the global flow-control tree, keyed by its flow
 * hash.  Returns 0 if inserted (or already present for this ifp), or
 * EAGAIN on a flow-hash collision with a different interface.
 *
 * Locking: takes ifnet_fc_lock as a spin lock for the lookup and
 * converts it to a regular mutex before the blocking zone allocation.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* search key: only the flow hash matters to ifce_cmp() */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer. There can be a collision
		 * on flow hash but the probability is low. Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	/* Z_WAITOK may block; safe now that the lock is a full mutex */
	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11499
/*
 * Look up and REMOVE the flow-control entry keyed by "flowhash" from
 * the global tree.  Returns the entry (caller must free it with
 * ifnet_fc_entry_free()), or NULL when there is no entry or its
 * interface is no longer attached (the entry is freed internally in
 * that case).
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* search key: only the flow hash matters to ifce_cmp() */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* detach the entry; ownership passes to the caller (or below) */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11537
/*
 * Release a flow-control entry (allocated in ifnet_fc_add()) back to
 * its zone.
 */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11543
11544 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11545 ifnet_calc_flowhash(struct ifnet *ifp)
11546 {
11547 struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11548 uint32_t flowhash = 0;
11549
11550 if (ifnet_flowhash_seed == 0) {
11551 ifnet_flowhash_seed = RandomULong();
11552 }
11553
11554 bzero(&fh, sizeof(fh));
11555
11556 (void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11557 fh.ifk_unit = ifp->if_unit;
11558 fh.ifk_flags = ifp->if_flags;
11559 fh.ifk_eflags = ifp->if_eflags;
11560 fh.ifk_capabilities = ifp->if_capabilities;
11561 fh.ifk_capenable = ifp->if_capenable;
11562 fh.ifk_output_sched_model = ifp->if_output_sched_model;
11563 fh.ifk_rand1 = RandomULong();
11564 fh.ifk_rand2 = RandomULong();
11565
11566 try_again:
11567 flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11568 if (flowhash == 0) {
11569 /* try to get a non-zero flowhash */
11570 ifnet_flowhash_seed = RandomULong();
11571 goto try_again;
11572 }
11573
11574 return flowhash;
11575 }
11576
/*
 * Store a per-family network signature for "ifp" (AF_INET or
 * AF_INET6).  A zero "len" clears the stored signature.  "flags" is
 * currently unused.
 *
 * Returns EINVAL for an unknown family or oversized signature, ENOMEM
 * when the per-family extra data was never allocated.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		/* same logic as AF_INET, against the inet6 extra data */
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11638
/*
 * Copy the stored per-family network signature of "ifp" into "data".
 * On input *len is the caller's buffer size; on success it is updated
 * to the signature length.  *flags (optional) is always set to 0.
 *
 * Returns EINVAL for a NULL/too-small buffer or unknown family, ENOENT
 * when no signature is stored, ENOMEM when the per-family extra data
 * was never allocated.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		/* same logic as AF_INET, against the inet6 extra data */
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11699
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on "ifp" (used
 * by CLAT46 translation).  A zero prefix_len clears that slot.  Each
 * non-zero prefix must use a standard NAT64 prefix length (RFC 6052:
 * 32/40/48/56/64/96) and must not be link/interface-local scoped.
 * NECP clients are notified when at least one prefix was set.
 *
 * Returns EINVAL on a bad prefix, ENOMEM when the inet6 extra data was
 * never allocated.  Note: an invalid prefix aborts the loop, leaving
 * earlier slots already updated.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* notify outside the lock */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11765
11766 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)11767 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11768 {
11769 int i, found_one = 0, error = 0;
11770
11771 if (ifp == NULL) {
11772 return EINVAL;
11773 }
11774
11775 if_inet6data_lock_shared(ifp);
11776
11777 if (IN6_IFEXTRA(ifp) == NULL) {
11778 error = ENOMEM;
11779 goto out;
11780 }
11781
11782 for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11783 if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
11784 found_one = 1;
11785 }
11786 }
11787
11788 if (found_one == 0) {
11789 error = ENOENT;
11790 goto out;
11791 }
11792
11793 if (prefixes) {
11794 bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
11795 sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
11796 }
11797
11798 out:
11799 if_inet6data_lock_done(ifp);
11800
11801 return error;
11802 }
11803
__attribute__((noinline))
/*
 * Hardware-checksum debugging (output path): when
 * HWCKSUM_DBG_FINALIZE_FORCED is set, force software finalization of
 * any pending delayed checksums on the outbound packet and count what
 * was finalized.  TSO packets are left alone, since their checksums
 * are produced by the hardware during segmentation.
 */
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	/* only act in forced-finalize mode, and never on TSO packets */
	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		/* other protocol families are not checksum-debugged */
		return;
	}
}
11845
/*
 * Receive-path checksum-offload debugging hook.
 *
 * Depending on hwcksum_dbg_mode, either force partial checksum offload on
 * the inbound packet (to simulate hardware without that capability) and/or
 * verify and re-adjust a partial checksum the driver claims to have
 * computed.  Updates the hwcksum_dbg_* counters as it goes.
 *
 * ifp           - receiving interface (used for logging only)
 * m             - inbound packet (pkthdr mbuf)
 * frame_header  - pointer to the link-layer header within the mbuf data area
 * pf            - protocol family; only PF_INET/PF_INET6 are processed
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/*
	 * The frame header must lie within the mbuf's data area and at or
	 * before the current data pointer; otherwise we cannot compute the
	 * link-layer header length below.
	 */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Link-layer header length: distance from frame header to payload */
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		/* Forced offset beyond the packet: nothing to sum */
		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* Discard whatever rx checksum state the driver set */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		/* Present the packet as if hardware did partial offload */
		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is relative to the frame header */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* Make rxoff relative to the start of the payload */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			/* No-op adjustment, or offset beyond the packet */
			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* Re-base the partial sum to the adjusted offset */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11970
11971 static int
11972 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
11973 {
11974 #pragma unused(arg1, arg2)
11975 u_int32_t i;
11976 int err;
11977
11978 i = hwcksum_dbg_mode;
11979
11980 err = sysctl_handle_int(oidp, &i, 0, req);
11981 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11982 return err;
11983 }
11984
11985 if (hwcksum_dbg == 0) {
11986 return ENODEV;
11987 }
11988
11989 if ((i & ~HWCKSUM_DBG_MASK) != 0) {
11990 return EINVAL;
11991 }
11992
11993 hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
11994
11995 return err;
11996 }
11997
11998 static int
11999 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12000 {
12001 #pragma unused(arg1, arg2)
12002 u_int32_t i;
12003 int err;
12004
12005 i = hwcksum_dbg_partial_rxoff_forced;
12006
12007 err = sysctl_handle_int(oidp, &i, 0, req);
12008 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12009 return err;
12010 }
12011
12012 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12013 return ENODEV;
12014 }
12015
12016 hwcksum_dbg_partial_rxoff_forced = i;
12017
12018 return err;
12019 }
12020
12021 static int
12022 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12023 {
12024 #pragma unused(arg1, arg2)
12025 u_int32_t i;
12026 int err;
12027
12028 i = hwcksum_dbg_partial_rxoff_adj;
12029
12030 err = sysctl_handle_int(oidp, &i, 0, req);
12031 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12032 return err;
12033 }
12034
12035 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12036 return ENODEV;
12037 }
12038
12039 hwcksum_dbg_partial_rxoff_adj = i;
12040
12041 return err;
12042 }
12043
12044 static int
12045 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12046 {
12047 #pragma unused(oidp, arg1, arg2)
12048 int err;
12049
12050 if (req->oldptr == USER_ADDR_NULL) {
12051 }
12052 if (req->newptr != USER_ADDR_NULL) {
12053 return EPERM;
12054 }
12055 err = SYSCTL_OUT(req, &tx_chain_len_stats,
12056 sizeof(struct chain_len_stats));
12057
12058 return err;
12059 }
12060
12061
#if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification: arbitrary byte pattern (325 bytes) used as
 * the input for the m_sum16()/b_sum16()/in_cksum_mbuf_ref() self-tests in
 * dlil_verify_sum16().  The bytes themselves are not meaningful; only the
 * checksums over prefixes of this data (see sumtbl) matter.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
12099
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t init;         /* TRUE once sumr has been filled in */
	uint16_t len;           /* prefix length of sumdata to checksum */
	uint16_t sumr;          /* reference (computed at runtime via in_cksum_mbuf_ref) */
	uint16_t sumrp;         /* reference, precomputed (hardcoded expectation) */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12124
12125 static void
dlil_verify_sum16(void)12126 dlil_verify_sum16(void)
12127 {
12128 struct mbuf *m;
12129 uint8_t *buf;
12130 int n;
12131
12132 /* Make sure test data plus extra room for alignment fits in cluster */
12133 _CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);
12134
12135 kprintf("DLIL: running SUM16 self-tests ... ");
12136
12137 m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
12138 m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));
12139
12140 buf = mtod(m, uint8_t *); /* base address */
12141
12142 for (n = 0; n < SUMTBL_MAX; n++) {
12143 uint16_t len = sumtbl[n].len;
12144 int i;
12145
12146 /* Verify for all possible alignments */
12147 for (i = 0; i < (int)sizeof(uint64_t); i++) {
12148 uint16_t sum, sumr;
12149 uint8_t *c;
12150
12151 /* Copy over test data to mbuf */
12152 VERIFY(len <= sizeof(sumdata));
12153 c = buf + i;
12154 bcopy(sumdata, c, len);
12155
12156 /* Zero-offset test (align by data pointer) */
12157 m->m_data = (caddr_t)c;
12158 m->m_len = len;
12159 sum = m_sum16(m, 0, len);
12160
12161 if (!sumtbl[n].init) {
12162 sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
12163 sumtbl[n].sumr = sumr;
12164 sumtbl[n].init = TRUE;
12165 } else {
12166 sumr = sumtbl[n].sumr;
12167 }
12168
12169 /* Something is horribly broken; stop now */
12170 if (sumr != sumtbl[n].sumrp) {
12171 panic_plain("\n%s: broken in_cksum_mbuf_ref() "
12172 "for len=%d align=%d sum=0x%04x "
12173 "[expected=0x%04x]\n", __func__,
12174 len, i, sum, sumr);
12175 /* NOTREACHED */
12176 } else if (sum != sumr) {
12177 panic_plain("\n%s: broken m_sum16() for len=%d "
12178 "align=%d sum=0x%04x [expected=0x%04x]\n",
12179 __func__, len, i, sum, sumr);
12180 /* NOTREACHED */
12181 }
12182
12183 /* Alignment test by offset (fixed data pointer) */
12184 m->m_data = (caddr_t)buf;
12185 m->m_len = i + len;
12186 sum = m_sum16(m, i, len);
12187
12188 /* Something is horribly broken; stop now */
12189 if (sum != sumr) {
12190 panic_plain("\n%s: broken m_sum16() for len=%d "
12191 "offset=%d sum=0x%04x [expected=0x%04x]\n",
12192 __func__, len, i, sum, sumr);
12193 /* NOTREACHED */
12194 }
12195 #if INET
12196 /* Simple sum16 contiguous buffer test by aligment */
12197 sum = b_sum16(c, len);
12198
12199 /* Something is horribly broken; stop now */
12200 if (sum != sumr) {
12201 panic_plain("\n%s: broken b_sum16() for len=%d "
12202 "align=%d sum=0x%04x [expected=0x%04x]\n",
12203 __func__, len, i, sum, sumr);
12204 /* NOTREACHED */
12205 }
12206 #endif /* INET */
12207 }
12208 }
12209 m_freem(m);
12210
12211 kprintf("PASSED\n");
12212 }
12213 #endif /* DEBUG || DEVELOPMENT */
12214
12215 #define CASE_STRINGIFY(x) case x: return #x
12216
12217 __private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)12218 dlil_kev_dl_code_str(u_int32_t event_code)
12219 {
12220 switch (event_code) {
12221 CASE_STRINGIFY(KEV_DL_SIFFLAGS);
12222 CASE_STRINGIFY(KEV_DL_SIFMETRICS);
12223 CASE_STRINGIFY(KEV_DL_SIFMTU);
12224 CASE_STRINGIFY(KEV_DL_SIFPHYS);
12225 CASE_STRINGIFY(KEV_DL_SIFMEDIA);
12226 CASE_STRINGIFY(KEV_DL_SIFGENERIC);
12227 CASE_STRINGIFY(KEV_DL_ADDMULTI);
12228 CASE_STRINGIFY(KEV_DL_DELMULTI);
12229 CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
12230 CASE_STRINGIFY(KEV_DL_IF_DETACHING);
12231 CASE_STRINGIFY(KEV_DL_IF_DETACHED);
12232 CASE_STRINGIFY(KEV_DL_LINK_OFF);
12233 CASE_STRINGIFY(KEV_DL_LINK_ON);
12234 CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
12235 CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
12236 CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
12237 CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
12238 CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
12239 CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
12240 CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
12241 CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
12242 CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
12243 CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
12244 CASE_STRINGIFY(KEV_DL_ISSUES);
12245 CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
12246 default:
12247 break;
12248 }
12249 return "";
12250 }
12251
12252 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12253 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12254 {
12255 #pragma unused(arg1)
12256 struct ifnet *ifp = arg0;
12257
12258 if (ifnet_is_attached(ifp, 1)) {
12259 nstat_ifnet_threshold_reached(ifp->if_index);
12260 ifnet_decr_iorefcnt(ifp);
12261 }
12262 }
12263
/*
 * Check whether the interface has moved more than if_data_threshold bytes
 * (rx + tx combined) since the last notification and, if so, schedule the
 * data-threshold thread call to notify NetworkStatistics.  The CAS on
 * if_dt_bytes ensures only one caller wins when racing, and the
 * thread_call_isactive() check avoids re-arming a pending call.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Defer the notification to the next periodic slot */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No interval configured: notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12293
12294 #if (DEVELOPMENT || DEBUG)
12295 /*
12296 * The sysctl variable name contains the input parameters of
12297 * ifnet_get_keepalive_offload_frames()
12298 * ifp (interface index): name[0]
12299 * frames_array_count: name[1]
12300 * frame_data_offset: name[2]
12301 * The return length gives used_frames_count
12302 */
12303 static int
12304 sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
12305 {
12306 #pragma unused(oidp)
12307 int *name = (int *)arg1;
12308 u_int namelen = arg2;
12309 int idx;
12310 ifnet_t ifp = NULL;
12311 u_int32_t frames_array_count;
12312 size_t frame_data_offset;
12313 u_int32_t used_frames_count;
12314 struct ifnet_keepalive_offload_frame *frames_array = NULL;
12315 int error = 0;
12316 u_int32_t i;
12317
12318 /*
12319 * Only root can get look at other people TCP frames
12320 */
12321 error = proc_suser(current_proc());
12322 if (error != 0) {
12323 goto done;
12324 }
12325 /*
12326 * Validate the input parameters
12327 */
12328 if (req->newptr != USER_ADDR_NULL) {
12329 error = EPERM;
12330 goto done;
12331 }
12332 if (namelen != 3) {
12333 error = EINVAL;
12334 goto done;
12335 }
12336 if (req->oldptr == USER_ADDR_NULL) {
12337 error = EINVAL;
12338 goto done;
12339 }
12340 if (req->oldlen == 0) {
12341 error = EINVAL;
12342 goto done;
12343 }
12344 idx = name[0];
12345 frames_array_count = name[1];
12346 frame_data_offset = name[2];
12347
12348 /* Make sure the passed buffer is large enough */
12349 if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
12350 req->oldlen) {
12351 error = ENOMEM;
12352 goto done;
12353 }
12354
12355 ifnet_head_lock_shared();
12356 if (!IF_INDEX_IN_RANGE(idx)) {
12357 ifnet_head_done();
12358 error = ENOENT;
12359 goto done;
12360 }
12361 ifp = ifindex2ifnet[idx];
12362 ifnet_head_done();
12363
12364 frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
12365 frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
12366 Z_WAITOK);
12367 if (frames_array == NULL) {
12368 error = ENOMEM;
12369 goto done;
12370 }
12371
12372 error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
12373 frames_array_count, frame_data_offset, &used_frames_count);
12374 if (error != 0) {
12375 DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
12376 __func__, error);
12377 goto done;
12378 }
12379
12380 for (i = 0; i < used_frames_count; i++) {
12381 error = SYSCTL_OUT(req, frames_array + i,
12382 sizeof(struct ifnet_keepalive_offload_frame));
12383 if (error != 0) {
12384 goto done;
12385 }
12386 }
12387 done:
12388 if (frames_array != NULL) {
12389 kfree_data(frames_array, frames_array_count *
12390 sizeof(struct ifnet_keepalive_offload_frame));
12391 }
12392 return error;
12393 }
12394 #endif /* DEVELOPMENT || DEBUG */
12395
/*
 * Update per-flow interface statistics; thin wrapper forwarding to
 * tcp_update_stats_per_flow().
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12402
/* Atomically OR set_flags into *flags_p; returns the previous flag value. */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12408
/* Atomically clear clear_flags in *flags_p (AND with the complement). */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12414
/* Atomically set bits in if_eflags; returns the previous flag value. */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12420
/* Atomically clear bits in if_eflags. */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12426
/* Atomically set bits in if_xflags; returns the previous flag value. */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12432
/* Atomically clear bits in if_xflags. */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12438
/* Atomically bump the traffic-rule generation id for the interface. */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	atomic_add_32(&ifp->if_traffic_rule_genid, 1);
}
12444
12445 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12446 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12447 {
12448 if (*genid != ifp->if_traffic_rule_genid) {
12449 *genid = ifp->if_traffic_rule_genid;
12450 return TRUE;
12451 }
12452 return FALSE;
12453 }
/* Set the traffic-rule count and bump the generation id to signal change. */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	atomic_set_32(&ifp->if_traffic_rule_count, count);
	ifnet_update_traffic_rule_genid(ifp);
}
12460
12461 static void
log_hexdump(void * data,size_t len)12462 log_hexdump(void *data, size_t len)
12463 {
12464 size_t i, j, k;
12465 unsigned char *ptr = (unsigned char *)data;
12466 #define MAX_DUMP_BUF 32
12467 unsigned char buf[3 * MAX_DUMP_BUF + 1];
12468
12469 for (i = 0; i < len; i += MAX_DUMP_BUF) {
12470 for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12471 unsigned char msnbl = ptr[j] >> 4;
12472 unsigned char lsnbl = ptr[j] & 0x0f;
12473
12474 buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12475 buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12476
12477 if ((j % 2) == 1) {
12478 buf[k++] = ' ';
12479 }
12480 if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12481 buf[k++] = ' ';
12482 }
12483 }
12484 buf[k] = 0;
12485 os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12486 }
12487 }
12488
12489 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
12490 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12491 net_check_compatible_if_filter(struct ifnet *ifp)
12492 {
12493 if (ifp == NULL) {
12494 if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12495 return false;
12496 }
12497 } else {
12498 if (ifp->if_flt_non_os_count > 0) {
12499 return false;
12500 }
12501 }
12502 return true;
12503 }
12504 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12505
/*
 * Advance the (c, clen) output cursor after an scnprintf() that wrote k
 * bytes; jumps to the caller's "done" label when the buffer is exhausted.
 * Relies on locals c, clen, k and a done: label in the calling function.
 * Wrapped in do/while(0) so the macro expands to a single statement and is
 * safe inside unbraced if/else bodies (the bare-brace form was not).
 */
#define DUMP_BUF_CHK() do {                                             \
	clen -= k;                                                      \
	if (clen < 1)                                                   \
	        goto done;                                              \
	c += k;                                                         \
} while (0)
12512
int dlil_dump_top_if_qlen(char *, int);
/*
 * Scan all interfaces and write, into str (capacity str_len), one line for
 * the interface with the longest send-queue (ifcq_len) and one for the
 * interface with the longest DLIL input queue.  Returns the number of
 * bytes written.
 *
 * NOTE(review): walks ifindex2ifnet without taking the ifnet head lock,
 * and the loop runs while ifidx < if_index (the last index is not
 * visited) -- confirm both are intentional for this debug dump.
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;
	struct ifnet *top_ifcq_ifp = NULL;      /* busiest send queue */
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;       /* busiest input queue */
	uint32_t top_inq_len = 0;

	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	/* DUMP_BUF_CHK() advances c/clen and jumps to done: when full */
	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12554
12555 #if DEVELOPMENT || DEBUG
12556 __private_extern__ int
packet_dump_trace_update(__unused struct sysctl_oid * oidp,__unused void * arg1,__unused int arg2,struct sysctl_req * req)12557 packet_dump_trace_update(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
12558 {
12559 struct flow_key key = {};
12560 int error = 0;
12561
12562 if (req->newptr == USER_ADDR_NULL) {
12563 return EINVAL;
12564 }
12565 if (req->newlen < sizeof(struct flow_key)) {
12566 return EINVAL;
12567 }
12568 error = SYSCTL_IN(req, &key, sizeof(struct flow_key));
12569 if (error != 0) {
12570 return error;
12571 }
12572
12573 switch (key.fk_ipver) {
12574 case IPVERSION:
12575 if (key.fk_proto != IPPROTO_UDP ||
12576 key.fk_sport == 0 || key.fk_dport == 0) {
12577 return EINVAL;
12578 }
12579
12580 if (key.fk_src4.s_addr == INADDR_ANY ||
12581 key.fk_dst4.s_addr == INADDR_ANY) {
12582 return EINVAL;
12583 }
12584
12585 break;
12586 case IPV6_VERSION:
12587 if (key.fk_proto != IPPROTO_UDP ||
12588 key.fk_sport == 0 || key.fk_dport == 0) {
12589 return EINVAL;
12590 }
12591
12592 if (IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
12593 IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
12594 return EINVAL;
12595 }
12596
12597 break;
12598 case 0:
12599 if (key.fk_proto != 0 ||
12600 key.fk_sport != 0 || key.fk_dport != 0) {
12601 return EINVAL;
12602 }
12603
12604 if (!IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
12605 !IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
12606 return EINVAL;
12607 }
12608
12609 break;
12610 default:
12611 return EINVAL;
12612 }
12613
12614 memcpy(&flow_key_trace, &key, sizeof(struct flow_key));
12615 return 0;
12616 }
12617 #endif /* DEVELOPMENT || DEBUG */
12618