1 /*
2 * Copyright (c) 1999-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections. This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34 #include <stddef.h>
35 #include <ptrauth.h>
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/domain.h>
44 #include <sys/user.h>
45 #include <sys/random.h>
46 #include <sys/socketvar.h>
47 #include <net/if_dl.h>
48 #include <net/if.h>
49 #include <net/route.h>
50 #include <net/if_var.h>
51 #include <net/dlil.h>
52 #include <net/if_arp.h>
53 #include <net/iptap.h>
54 #include <net/pktap.h>
55 #include <net/nwk_wq.h>
56 #include <sys/kern_event.h>
57 #include <sys/kdebug.h>
58 #include <sys/mcache.h>
59 #include <sys/syslog.h>
60 #include <sys/protosw.h>
61 #include <sys/priv.h>
62
63 #include <kern/assert.h>
64 #include <kern/task.h>
65 #include <kern/thread.h>
66 #include <kern/sched_prim.h>
67 #include <kern/locks.h>
68 #include <kern/zalloc.h>
69
70 #include <net/kpi_protocol.h>
71 #include <net/if_types.h>
72 #include <net/if_ipsec.h>
73 #include <net/if_llreach.h>
74 #include <net/if_utun.h>
75 #include <net/kpi_interfacefilter.h>
76 #include <net/classq/classq.h>
77 #include <net/classq/classq_sfb.h>
78 #include <net/flowhash.h>
79 #include <net/ntstat.h>
80 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
81 #include <skywalk/lib/net_filter_event.h>
82 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
83 #include <net/if_llatbl.h>
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143
144 #include <os/log.h>
145
146 #define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0)
147 #define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2)
148 #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
149 #define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
150 #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
151
152 #define IFNET_KTRACE_TX_PKT_DUMP IFNETDBG_CODE(DBG_IFNET, 0x001)
153 #define IFNET_KTRACE_RX_PKT_DUMP IFNETDBG_CODE(DBG_IFNET, 0x002)
154
155 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
156 #define MAX_LINKADDR 4 /* LONGWORDS */
157
158
159 #if 1
160 #define DLIL_PRINTF printf
161 #else
162 #define DLIL_PRINTF kprintf
163 #endif
164
165 #define IF_DATA_REQUIRE_ALIGNED_64(f) \
166 _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
167
168 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \
169 _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
170
/*
 * Protocol KPI version numbers; stored in if_proto.proto_kpi to select
 * which arm of the if_proto.kpi union (v1 or v2) holds valid callbacks.
 */
enum {
	kProtoKPI_v1 = 1,
	kProtoKPI_v2 = 2
};
175
176 /*
177 * List of if_proto structures in if_proto_hash[] is protected by
178 * the ifnet lock. The rest of the fields are initialized at protocol
179 * attach time and never change, thus no lock required as long as
180 * a reference to it is valid, via if_proto_ref().
181 */
struct if_proto {
	SLIST_ENTRY(if_proto) next_hash;        /* if_proto_hash[] chain; ifnet lock */
	u_int32_t refcount;                     /* held refs; see if_proto_ref()/free() */
	u_int32_t detached;                     /* non-zero once detached from ifp */
	struct ifnet *ifp;                      /* interface this protocol is attached to */
	protocol_family_t protocol_family;      /* e.g. PF_INET, PF_INET6 */
	int proto_kpi;                          /* kProtoKPI_v1 or kProtoKPI_v2; selects union arm */
	union {
		/* valid when proto_kpi == kProtoKPI_v1 */
		struct {
			proto_media_input input;                /* per-packet input */
			proto_media_preout pre_output;          /* pre-output hook */
			proto_media_event event;                /* interface event hook */
			proto_media_ioctl ioctl;                /* protocol ioctl hook */
			proto_media_detached detached;          /* detach notification */
			proto_media_resolve_multi resolve_multi; /* multicast resolution */
			proto_media_send_arp send_arp;          /* ARP transmit hook */
		} v1;
		/* valid when proto_kpi == kProtoKPI_v2 (packet-chain input) */
		struct {
			proto_media_input_v2 input;             /* packet-chain input */
			proto_media_preout pre_output;          /* pre-output hook */
			proto_media_event event;                /* interface event hook */
			proto_media_ioctl ioctl;                /* protocol ioctl hook */
			proto_media_detached detached;          /* detach notification */
			proto_media_resolve_multi resolve_multi; /* multicast resolution */
			proto_media_send_arp send_arp;          /* ARP transmit hook */
		} v2;
	} kpi;
};
210
211 SLIST_HEAD(proto_hash_entry, if_proto);
212
213 #define DLIL_SDLDATALEN \
214 (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
215
/*
 * DLIL-private wrapper around the public ifnet.  The embedded dl_if MUST
 * remain the first member so DLIL_TO_IFP()/IFP_TO_DLIL() casts are valid.
 */
struct dlil_ifnet {
	struct ifnet dl_if; /* public ifnet; must stay first (see IFP_TO_DLIL) */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */
	u_int32_t dl_if_flags; /* flags (below) */
	u_int32_t dl_if_refcnt; /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void *dl_if_uniqueid; /* unique interface id */
	size_t dl_if_uniqueid_len; /* length of the unique id */
	char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */
	char dl_if_xnamestorage[IFXNAMSIZ]; /* external name storage */
	struct {
		struct ifaddr ifa; /* lladdr ifa */
		u_int8_t asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr;
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set; /* non-zero once the above is valid */
	u_int8_t dl_if_unused; /* padding; keeps following fields aligned */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t dl_if_attach; /* attach PC stacktrace */
	ctrace_t dl_if_detach; /* detach PC stacktrace */
};
243
244 /* Values for dl_if_flags (private to DLIL) */
245 #define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */
246 #define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */
247 #define DLIF_DEBUG 0x4 /* has debugging info */
248
249 #define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */
250
251 /* For gdb */
252 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
253
/*
 * Debug variant of dlil_ifnet; used when ifnet_debug is enabled so that
 * reference hold/release call sites can be recorded for post-mortem
 * analysis.  The embedded dldbg_dlif must stay first (cast-compatible
 * with struct dlil_ifnet).
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet dldbg_dlif; /* dlil_ifnet; must stay first */
	u_int16_t dldbg_if_refhold_cnt; /* # ifnet references */
	u_int16_t dldbg_if_refrele_cnt; /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
264
265 #define DLIL_TO_IFP(s) (&s->dl_if)
266 #define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s)
267
/*
 * Interface filter instance, allocated from dlif_filt_zone.  One entry
 * per attached filter; chained off the owning interface.
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter) filt_next;    /* interface filter list linkage */
	u_int32_t filt_skip;                    /* non-zero: skip this filter — TODO confirm exact use */
	u_int32_t filt_flags;                   /* filter flags supplied at attach */
	ifnet_t filt_ifp;                       /* interface the filter is attached to */
	const char *filt_name;                  /* filter name (for identification/logs) */
	void *filt_cookie;                      /* caller-supplied opaque context */
	protocol_family_t filt_protocol;        /* protocol of interest; presumably 0 = all */
	iff_input_func filt_input;              /* inbound packet callback (may be NULL) */
	iff_output_func filt_output;            /* outbound packet callback (may be NULL) */
	iff_event_func filt_event;              /* interface event callback (may be NULL) */
	iff_ioctl_func filt_ioctl;              /* ioctl callback (may be NULL) */
	iff_detached_func filt_detached;        /* called when filter is fully detached */
};
282
283 struct proto_input_entry;
284
285 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
286
287 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
288
289 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
290 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
291 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
292 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
293 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
294
295 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
296 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
297 &dlil_lck_attributes);
298 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
299 &dlil_lck_attributes);
300
301 #if DEBUG
302 static unsigned int ifnet_debug = 1; /* debugging (enabled) */
303 #else
304 static unsigned int ifnet_debug; /* debugging (disabled) */
305 #endif /* !DEBUG */
306 static unsigned int dlif_size; /* size of dlil_ifnet to allocate */
307 static unsigned int dlif_bufsize; /* size of dlif_size + headroom */
308 static struct zone *dlif_zone; /* zone for dlil_ifnet */
309 #define DLIF_ZONE_NAME "ifnet" /* zone name */
310
311 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
312
313 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
314
315 static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */
316 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
317 static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */
318 #define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */
319
320 static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */
321 static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */
322 static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */
323 #define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */
324
325 static u_int32_t net_rtref;
326
327 static struct dlil_main_threading_info dlil_main_input_thread_info;
328 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
329 (struct dlil_threading_info *)&dlil_main_input_thread_info;
330
331 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
332 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
333 static void dlil_if_trace(struct dlil_ifnet *, int);
334 static void if_proto_ref(struct if_proto *);
335 static void if_proto_free(struct if_proto *);
336 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
337 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
338 u_int32_t list_count);
339 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
340 static void if_flt_monitor_busy(struct ifnet *);
341 static void if_flt_monitor_unbusy(struct ifnet *);
342 static void if_flt_monitor_enter(struct ifnet *);
343 static void if_flt_monitor_leave(struct ifnet *);
344 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
345 char **, protocol_family_t);
346 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
347 protocol_family_t);
348 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
349 const struct sockaddr_dl *);
350 static int ifnet_lookup(struct ifnet *);
351 static void if_purgeaddrs(struct ifnet *);
352
353 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
354 struct mbuf *, char *);
355 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
356 struct mbuf *);
357 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
358 mbuf_t *, const struct sockaddr *, void *, char *, char *);
359 static void ifproto_media_event(struct ifnet *, protocol_family_t,
360 const struct kev_msg *);
361 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
362 unsigned long, void *);
363 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
364 struct sockaddr_dl *, size_t);
365 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
366 const struct sockaddr_dl *, const struct sockaddr *,
367 const struct sockaddr_dl *, const struct sockaddr *);
368
369 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
370 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
371 boolean_t poll, struct thread *tp);
372 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
373 struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
374 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
375 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
376 protocol_family_t *);
377 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
378 const struct ifnet_demux_desc *, u_int32_t);
379 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
380 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
381 #if !XNU_TARGET_OS_OSX
382 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
383 const struct sockaddr *, const char *, const char *,
384 u_int32_t *, u_int32_t *);
385 #else /* XNU_TARGET_OS_OSX */
386 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
387 const struct sockaddr *, const char *, const char *);
388 #endif /* XNU_TARGET_OS_OSX */
389 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
390 const struct sockaddr *, const char *, const char *,
391 u_int32_t *, u_int32_t *);
392 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
393 static void ifp_if_free(struct ifnet *);
394 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
395 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
396 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
397
398 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
399 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
400 boolean_t, struct thread *);
401 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
402 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
403 boolean_t, struct thread *);
404
405 static void dlil_main_input_thread_func(void *, wait_result_t);
406 static void dlil_main_input_thread_cont(void *, wait_result_t);
407
408 static void dlil_input_thread_func(void *, wait_result_t);
409 static void dlil_input_thread_cont(void *, wait_result_t);
410
411 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
412 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
413
414 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
415 thread_continue_t *);
416 static void dlil_terminate_input_thread(struct dlil_threading_info *);
417 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
418 struct dlil_threading_info *, struct ifnet *, boolean_t);
419 static boolean_t dlil_input_stats_sync(struct ifnet *,
420 struct dlil_threading_info *);
421 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
422 u_int32_t, ifnet_model_t, boolean_t);
423 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
424 const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
425 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
426 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
427 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
428 #if DEBUG || DEVELOPMENT
429 static void dlil_verify_sum16(void);
430 #endif /* DEBUG || DEVELOPMENT */
431 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
432 protocol_family_t);
433 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
434 protocol_family_t);
435
436 static void dlil_incr_pending_thread_count(void);
437 static void dlil_decr_pending_thread_count(void);
438
439 static void ifnet_detacher_thread_func(void *, wait_result_t);
440 static void ifnet_detacher_thread_cont(void *, wait_result_t);
441 static void ifnet_detach_final(struct ifnet *);
442 static void ifnet_detaching_enqueue(struct ifnet *);
443 static struct ifnet *ifnet_detaching_dequeue(void);
444
445 static void ifnet_start_thread_func(void *, wait_result_t);
446 static void ifnet_start_thread_cont(void *, wait_result_t);
447
448 static void ifnet_poll_thread_func(void *, wait_result_t);
449 static void ifnet_poll_thread_cont(void *, wait_result_t);
450
451 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
452 classq_pkt_t *, boolean_t, boolean_t *);
453
454 static void ifp_src_route_copyout(struct ifnet *, struct route *);
455 static void ifp_src_route_copyin(struct ifnet *, struct route *);
456 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
457 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
458
459 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
460 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
461 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
462 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
463 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
464 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
465 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
466 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
467 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
468 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
469 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
470
471 struct chain_len_stats tx_chain_len_stats;
472 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
473
474 #if TEST_INPUT_THREAD_TERMINATION
475 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
476 #endif /* TEST_INPUT_THREAD_TERMINATION */
477
478
479 /* The following are protected by dlil_ifnet_lock */
480 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
481 static u_int32_t ifnet_detaching_cnt;
482 static boolean_t ifnet_detaching_embryonic;
483 static void *ifnet_delayed_run; /* wait channel for detaching thread */
484
485 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
486 &dlil_lck_attributes);
487
488 static uint32_t ifnet_flowhash_seed;
489
/*
 * Key material folded into the per-interface flow hash (see
 * ifnet_calc_flowhash()); mixes stable interface identity with
 * random salt values.
 */
struct ifnet_flowhash_key {
	char ifk_name[IFNAMSIZ];        /* interface name */
	uint32_t ifk_unit;              /* interface unit number */
	uint32_t ifk_flags;             /* interface flags */
	uint32_t ifk_eflags;            /* extended flags */
	uint32_t ifk_capabilities;      /* capability bits */
	uint32_t ifk_capenable;         /* enabled capability bits */
	uint32_t ifk_output_sched_model; /* output scheduling model */
	uint32_t ifk_rand1;             /* random salt */
	uint32_t ifk_rand2;             /* random salt */
};
501
502 /* Flow control entry per interface */
503 struct ifnet_fc_entry {
504 RB_ENTRY(ifnet_fc_entry) ifce_entry;
505 u_int32_t ifce_flowhash;
506 struct ifnet *ifce_ifp;
507 };
508
509 static uint32_t ifnet_calc_flowhash(struct ifnet *);
510 static int ifce_cmp(const struct ifnet_fc_entry *,
511 const struct ifnet_fc_entry *);
512 static int ifnet_fc_add(struct ifnet *);
513 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
514 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
515
516 /* protected by ifnet_fc_lock */
517 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
518 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
519 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
520
521 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
522
523 extern void bpfdetach(struct ifnet *);
524 extern void proto_input_run(void);
525
526 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
527 u_int32_t flags);
528 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
529 u_int32_t flags);
530
531 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
532
533 #if CONFIG_MACF
534 #if !XNU_TARGET_OS_OSX
535 int dlil_lladdr_ckreq = 1;
536 #else /* XNU_TARGET_OS_OSX */
537 int dlil_lladdr_ckreq = 0;
538 #endif /* XNU_TARGET_OS_OSX */
539 #endif /* CONFIG_MACF */
540
541 #if DEBUG
542 int dlil_verbose = 1;
543 #else
544 int dlil_verbose = 0;
545 #endif /* DEBUG */
546 #if IFNET_INPUT_SANITY_CHK
547 /* sanity checking of input packet lists received */
548 static u_int32_t dlil_input_sanity_check = 0;
549 #endif /* IFNET_INPUT_SANITY_CHK */
550 /* rate limit debug messages */
551 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
552
553 SYSCTL_DECL(_net_link_generic_system);
554
555 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
556 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
557
558 #define IF_SNDQ_MINLEN 32
559 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
560 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
561 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
562 sysctl_sndq_maxlen, "I", "Default transmit queue max length");
563
564 #define IF_RCVQ_MINLEN 32
565 #define IF_RCVQ_MAXLEN 256
566 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
567 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
568 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
569 sysctl_rcvq_maxlen, "I", "Default receive queue max length");
570
571 #define IF_RXPOLL_DECAY 2 /* ilog2 of EWMA decay rate (4) */
572 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
573 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
574 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
575 "ilog2 of EWMA decay rate of avg inbound packets");
576
577 #define IF_RXPOLL_MODE_HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10 ms */
578 #define IF_RXPOLL_MODE_HOLDTIME (1000ULL * 1000 * 1000) /* 1 sec */
579 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
580 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
581 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
582 IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
583 "Q", "input poll mode freeze time");
584
585 #define IF_RXPOLL_SAMPLETIME_MIN (1ULL * 1000 * 1000) /* 1 ms */
586 #define IF_RXPOLL_SAMPLETIME (10ULL * 1000 * 1000) /* 10 ms */
587 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
588 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
589 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
590 IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
591 "Q", "input poll sampling time");
592
593 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
594 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
595 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
596 IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
597 "Q", "input poll interval (time)");
598
599 #define IF_RXPOLL_INTERVAL_PKTS 0 /* 0 (disabled) */
600 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
601 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
602 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
603 IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
604
605 #define IF_RXPOLL_WLOWAT 10
606 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
607 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
608 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
609 IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
610 "I", "input poll wakeup low watermark");
611
612 #define IF_RXPOLL_WHIWAT 100
613 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
614 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
615 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
616 IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
617 "I", "input poll wakeup high watermark");
618
619 static u_int32_t if_rxpoll_max = 0; /* 0 (automatic) */
620 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
621 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
622 "max packets per poll call");
623
624 u_int32_t if_rxpoll = 1;
625 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
626 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
627 sysctl_rxpoll, "I", "enable opportunistic input polling");
628
629 #if TEST_INPUT_THREAD_TERMINATION
630 static u_int32_t if_input_thread_termination_spin = 0;
631 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
632 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
633 &if_input_thread_termination_spin, 0,
634 sysctl_input_thread_termination_spin,
635 "I", "input thread termination spin limit");
636 #endif /* TEST_INPUT_THREAD_TERMINATION */
637
638 static u_int32_t cur_dlil_input_threads = 0;
639 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
640 CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
641 "Current number of DLIL input threads");
642
643 #if IFNET_INPUT_SANITY_CHK
644 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
645 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
646 "Turn on sanity checking in DLIL input");
647 #endif /* IFNET_INPUT_SANITY_CHK */
648
649 static u_int32_t if_flowadv = 1;
650 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
651 CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
652 "enable flow-advisory mechanism");
653
654 static u_int32_t if_delaybased_queue = 1;
655 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
656 CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
657 "enable delay based dynamic queue sizing");
658
659 static uint64_t hwcksum_in_invalidated = 0;
660 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
661 hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
662 &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
663
664 uint32_t hwcksum_dbg = 0;
665 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
666 CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
667 "enable hardware cksum debugging");
668
669 u_int32_t ifnet_start_delayed = 0;
670 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
671 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
672 "number of times start was delayed");
673
674 u_int32_t ifnet_delay_start_disabled = 0;
675 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
676 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
677 "number of times start was delayed");
678
679 #if DEVELOPMENT || DEBUG
680 static int packet_dump_trace_update SYSCTL_HANDLER_ARGS;
681
682 struct flow_key flow_key_trace;
683 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, flow_key_trace, CTLFLAG_WR | CTLFLAG_LOCKED |
684 CTLFLAG_KERN | CTLFLAG_ANYBODY, 0, 0, packet_dump_trace_update, "S", "Set flow key for packet tracing");
685 #endif /* DEVELOPMENT || DEBUG */
686
687 static inline void
ifnet_delay_start_disabled_increment(void)688 ifnet_delay_start_disabled_increment(void)
689 {
690 OSIncrementAtomic(&ifnet_delay_start_disabled);
691 }
692
693 #define HWCKSUM_DBG_PARTIAL_FORCED 0x1 /* forced partial checksum */
694 #define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ 0x2 /* adjust start offset */
695 #define HWCKSUM_DBG_FINALIZE_FORCED 0x10 /* forced finalize */
696 #define HWCKSUM_DBG_MASK \
697 (HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ | \
698 HWCKSUM_DBG_FINALIZE_FORCED)
699
700 static uint32_t hwcksum_dbg_mode = 0;
701 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
702 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
703 0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");
704
705 static uint64_t hwcksum_dbg_partial_forced = 0;
706 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
707 hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
708 &hwcksum_dbg_partial_forced, "packets forced using partial cksum");
709
710 static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
711 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
712 hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
713 &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");
714
715 static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
716 SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
717 hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
718 &hwcksum_dbg_partial_rxoff_forced, 0,
719 sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
720 "forced partial cksum rx offset");
721
722 static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
723 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
724 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
725 0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
726 "adjusted partial cksum rx offset");
727
/* count of rx packets whose hardware-computed checksum was verified good */
static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

/* count of rx packets whose hardware-computed checksum was wrong */
static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

/* count of rx packets carrying an invalid checksum start offset */
static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

/* count of rx packets whose checksum start offset was adjusted */
static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

/* count of packets whose header checksum was finalized */
static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

/* count of packets whose payload checksum was finalized */
static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

/* global switch for transmit hardware checksum offload (1 = enabled) */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

/* global switch for receive hardware checksum offload (1 = enabled) */
uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

/* read-only tx chain length statistics; arg2 (9) is passed to the handler */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

/* enables collection of the tx chain length stats above (0 = off) */
uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

/* enable/disable threshold notifications (consumed elsewhere in this file) */
static uint32_t threshold_notify = 1; /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

/* threshold notification interval */
static uint32_t threshold_interval = 2; /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
/* debug-only node; "kao" presumably keep-alive offload frames — see handler */
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

/* aggregate networking API usage counters, exported read-only as net.api_stats */
struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

/* enables wake-packet debug logging (0 = off) */
uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

unsigned int net_rxpoll = 1;    /* input polling */
unsigned int net_affinity = 1;  /* thread affinity */
unsigned int net_async = 1; /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
811 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)812 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
813 {
814 /*
815 * update filter count and route_generation ID to let TCP
816 * know it should reevalute doing TSO or not
817 */
818 if (filter_enable) {
819 OSAddAtomic(1, &ifp->if_flt_no_tso_count);
820 } else {
821 VERIFY(ifp->if_flt_no_tso_count != 0);
822 OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
823 }
824 routegenid_update();
825 }
826
827 #if SKYWALK
828
#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* nonzero when the default flags enable the flowswitch IP netagent */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
/* nonzero when the default flags enable the flowswitch transport netagent */
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

/* nonzero when the default flags request netif plumbing for all interfaces */
unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
845
846 #if (DEVELOPMENT || DEBUG)
847 static int
848 if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
849 {
850 #pragma unused(oidp, arg1, arg2)
851 unsigned int new_value;
852 int changed;
853 int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
854 &new_value, &changed);
855 if (error) {
856 return error;
857 }
858 if (changed) {
859 if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
860 (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
861 return ENOTSUP;
862 }
863 if_attach_nx = new_value;
864 }
865 return 0;
866 }
867
/* net.link.generic.system.if_attach_nx (DEVELOPMENT/DEBUG builds only) */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");
871
872 #endif /* DEVELOPMENT || DEBUG */
873
874 static int
875 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
876 {
877 #pragma unused(oidp, arg1, arg2)
878 unsigned int new_value;
879 int changed;
880 int error;
881
882 error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
883 sizeof(if_enable_fsw_transport_netagent),
884 &new_value, &changed);
885 if (error == 0 && changed != 0) {
886 if (new_value != 0 && new_value != 1) {
887 /* only allow 0 or 1 */
888 error = EINVAL;
889 } else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
890 /* netagent can be enabled/disabled */
891 if_enable_fsw_transport_netagent = new_value;
892 if (new_value == 0) {
893 kern_nexus_deregister_netagents();
894 } else {
895 kern_nexus_register_netagents();
896 }
897 } else {
898 /* netagent can't be enabled */
899 error = ENOTSUP;
900 }
901 }
902 return error;
903 }
904
/* net.link.generic.system.enable_netagent — handled just above */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
913
914 boolean_t
ifnet_nx_noauto(ifnet_t ifp)915 ifnet_nx_noauto(ifnet_t ifp)
916 {
917 return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
918 }
919
/* Low-latency interfaces never get an auto-attached flowswitch. */
boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}
925
926 boolean_t
ifnet_is_low_latency(ifnet_t ifp)927 ifnet_is_low_latency(ifnet_t ifp)
928 {
929 return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
930 }
931
/*
 * Decide whether this interface should be plumbed with the netif compat
 * layer.  Gated globally by IF_ATTACH_NX_NETIF_COMPAT; on non-macOS
 * targets, Wi-Fi "ap" (Access Point) interfaces only get compat when
 * if_netif_all is set.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
956
957 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)958 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
959 {
960 if (if_is_fsw_transport_netagent_enabled()) {
961 /* check if netagent has been manually enabled for ipsec/utun */
962 if (ifp->if_family == IFNET_FAMILY_IPSEC) {
963 return ipsec_interface_needs_netagent(ifp);
964 } else if (ifp->if_family == IFNET_FAMILY_UTUN) {
965 return utun_interface_needs_netagent(ifp);
966 }
967
968 /* check ifnet no auto nexus override */
969 if (ifnet_nx_noauto(ifp)) {
970 return FALSE;
971 }
972
973 /* check global if_attach_nx configuration */
974 switch (ifp->if_family) {
975 case IFNET_FAMILY_CELLULAR:
976 case IFNET_FAMILY_ETHERNET:
977 if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
978 return TRUE;
979 }
980 break;
981 default:
982 break;
983 }
984 }
985 return FALSE;
986 }
987
988 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)989 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
990 {
991 #pragma unused(ifp)
992 if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
993 return TRUE;
994 }
995 return FALSE;
996 }
997
998 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)999 ifnet_needs_netif_netagent(ifnet_t ifp)
1000 {
1001 #pragma unused(ifp)
1002 return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1003 }
1004
1005 static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,const char * func_str,uuid_t instance,uuid_t device)1006 dlil_detach_nexus_instance(nexus_controller_t controller,
1007 const char *func_str, uuid_t instance, uuid_t device)
1008 {
1009 errno_t err;
1010
1011 if (instance == NULL || uuid_is_null(instance)) {
1012 return FALSE;
1013 }
1014
1015 /* followed by the device port */
1016 if (device != NULL && !uuid_is_null(device)) {
1017 err = kern_nexus_ifdetach(controller, instance, device);
1018 if (err != 0) {
1019 DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
1020 func_str, err);
1021 }
1022 }
1023 err = kern_nexus_controller_free_provider_instance(controller,
1024 instance);
1025 if (err != 0) {
1026 DLIL_PRINTF("%s free_provider_instance failed %d\n",
1027 func_str, err);
1028 }
1029 return TRUE;
1030 }
1031
1032 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1033 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1034 uuid_t device)
1035 {
1036 boolean_t detached = FALSE;
1037 nexus_controller_t controller = kern_nexus_shared_controller();
1038 int err;
1039
1040 if (dlil_detach_nexus_instance(controller, func_str, instance,
1041 device)) {
1042 detached = TRUE;
1043 }
1044 if (provider != NULL && !uuid_is_null(provider)) {
1045 detached = TRUE;
1046 err = kern_nexus_controller_deregister_provider(controller,
1047 provider);
1048 if (err != 0) {
1049 DLIL_PRINTF("%s deregister_provider %d\n",
1050 func_str, err);
1051 }
1052 }
1053 return detached;
1054 }
1055
/*
 * Register a nexus provider of the given type (netif or flowswitch),
 * named "com.apple.<type>.<ifname>", and allocate one instance of it.
 * On success, *provider and *instance hold the new UUIDs and 0 is
 * returned; on failure anything created here is undone and an errno
 * is returned.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t dom_prov;
	errno_t err;
	nexus_name_t provider_name;
	const char *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* undo the provider registration made above */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
	/* success also reaches the label, with err == 0 */
failed:
	return err;
}
1105
/*
 * Create a netif nexus provider/instance for the interface and attach
 * the interface to it.  Fills in netif_nx on success and returns TRUE;
 * returns FALSE (after cleaning up) if the interface already has a
 * nexus or any step fails.
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus attributes to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	/* the attr is only needed during setup; destroy it on any exit */
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1159
1160 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1161 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1162 {
1163 if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1164 IFNET_IS_VMNET(ifp)) {
1165 goto failed;
1166 }
1167 switch (ifp->if_type) {
1168 case IFT_CELLULAR:
1169 case IFT_ETHER:
1170 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1171 /* don't auto-attach */
1172 goto failed;
1173 }
1174 break;
1175 default:
1176 /* don't auto-attach */
1177 goto failed;
1178 }
1179 return dlil_attach_netif_nexus_common(ifp, netif_nx);
1180
1181 failed:
1182 return FALSE;
1183 }
1184
1185 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1186 dlil_is_native_netif_nexus(ifnet_t ifp)
1187 {
1188 return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1189 }
1190
/* Tear down the netif nexus described by nexus_netif (provider,
 * instance and interface attachment). */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1198
1199 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)1200 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
1201 {
1202 struct ifreq ifr;
1203 int error;
1204
1205 bzero(&ifr, sizeof(ifr));
1206 error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
1207 if (error == 0) {
1208 *ifdm_p = ifr.ifr_devmtu;
1209 }
1210 return error;
1211 }
1212
1213 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1214 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1215 bool *use_multi_buflet, uint32_t *large_buf_size)
1216 {
1217 struct kern_pbufpool_memory_info rx_pp_info;
1218 struct kern_pbufpool_memory_info tx_pp_info;
1219 uint32_t if_max_mtu = 0;
1220 uint32_t drv_buf_size;
1221 struct ifdevmtu ifdm;
1222 int err;
1223
1224 /*
1225 * To perform intra-stack RX aggregation flowswitch needs to use
1226 * multi-buflet packet.
1227 */
1228 *use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1229
1230 *large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1231 /*
1232 * IP over Thunderbolt interface can deliver the largest IP packet,
1233 * but the driver advertises the MAX MTU as only 9K.
1234 */
1235 if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1236 if_max_mtu = IP_MAXPACKET;
1237 goto skip_mtu_ioctl;
1238 }
1239
1240 /* determine max mtu */
1241 bzero(&ifdm, sizeof(ifdm));
1242 err = dlil_siocgifdevmtu(ifp, &ifdm);
1243 if (__improbable(err != 0)) {
1244 DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1245 __func__, if_name(ifp));
1246 /* use default flowswitch buffer size */
1247 if_max_mtu = NX_FSW_BUFSIZE;
1248 } else {
1249 DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1250 ifdm.ifdm_max, ifdm.ifdm_current);
1251 /* rdar://problem/44589731 */
1252 if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1253 }
1254
1255 skip_mtu_ioctl:
1256 if (if_max_mtu == 0) {
1257 DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1258 __func__, if_name(ifp));
1259 return EINVAL;
1260 }
1261 if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1262 DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1263 "max bufsize(%d)\n", __func__,
1264 if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1265 return EINVAL;
1266 }
1267
1268 /*
1269 * for skywalk native driver, consult the driver packet pool also.
1270 */
1271 if (dlil_is_native_netif_nexus(ifp)) {
1272 err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1273 &tx_pp_info);
1274 if (err != 0) {
1275 DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1276 __func__, if_name(ifp));
1277 return ENXIO;
1278 }
1279 drv_buf_size = tx_pp_info.kpm_bufsize *
1280 tx_pp_info.kpm_max_frags;
1281 if (if_max_mtu > drv_buf_size) {
1282 DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1283 "tx %d * %d) can't support max mtu(%d)\n", __func__,
1284 if_name(ifp), rx_pp_info.kpm_bufsize,
1285 rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1286 tx_pp_info.kpm_max_frags, if_max_mtu);
1287 return EINVAL;
1288 }
1289 } else {
1290 drv_buf_size = if_max_mtu;
1291 }
1292
1293 if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1294 _CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1295 *use_multi_buflet = true;
1296 /* default flowswitch buffer size */
1297 *buf_size = NX_FSW_BUFSIZE;
1298 *large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1299 } else {
1300 *buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1301 }
1302
1303 /*
1304 * if HW TSO is enabled on a Skywalk native interface then make
1305 * the flowswitch default buffer be able to handle max TSO segment.
1306 */
1307 uint32_t tso_v4_mtu = 0;
1308 uint32_t tso_v6_mtu = 0;
1309 #ifdef XNU_TARGET_OS_OSX
1310 if (dlil_is_native_netif_nexus(ifp)) {
1311 if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
1312 tso_v4_mtu = ifp->if_tso_v4_mtu;
1313 }
1314 if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
1315 tso_v6_mtu = ifp->if_tso_v6_mtu;
1316 }
1317 }
1318 #endif /* XNU_TARGET_OS_OSX */
1319 if ((tso_v4_mtu != 0) || (tso_v6_mtu != 0)) {
1320 *buf_size = max(*buf_size, max(tso_v4_mtu, tso_v6_mtu));
1321 ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1322 }
1323 if (*buf_size >= *large_buf_size) {
1324 *large_buf_size = 0;
1325 }
1326 return 0;
1327 }
1328
/*
 * Create and attach a flowswitch nexus for the interface: compute the
 * buffer configuration, build nexus attributes, create the flowswitch
 * provider/instance and attach it to the interface's netif.  Fills in
 * nexus_fsw and returns TRUE on success; logs and returns FALSE (with
 * everything cleaned up) when the interface is excluded or a step fails.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err = 0;
	uuid_t netif;
	uint32_t buf_size = 0;
	uint32_t large_buf_size = 0;
	bool multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	/* err == 0 here means the interface was excluded, not an error */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1427
1428 static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)1429 dlil_attach_flowswitch_nexus(ifnet_t ifp)
1430 {
1431 boolean_t attached;
1432 if_nexus_flowswitch nexus_fsw;
1433
1434 #if (DEVELOPMENT || DEBUG)
1435 if (skywalk_netif_direct_allowed(if_name(ifp))) {
1436 DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
1437 return FALSE;
1438 }
1439 #endif /* (DEVELOPMENT || DEBUG) */
1440
1441 /*
1442 * flowswitch attachment is not supported for interface using the
1443 * legacy model (IFNET_INIT_LEGACY)
1444 */
1445 if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
1446 DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
1447 if_name(ifp));
1448 return FALSE;
1449 }
1450
1451 if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
1452 /* it's already attached */
1453 return FALSE;
1454 }
1455 bzero(&nexus_fsw, sizeof(nexus_fsw));
1456 attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
1457 if (attached) {
1458 ifnet_lock_exclusive(ifp);
1459 if (!IF_FULLY_ATTACHED(ifp)) {
1460 /* interface is going away */
1461 attached = FALSE;
1462 } else {
1463 ifp->if_nx_flowswitch = nexus_fsw;
1464 }
1465 ifnet_lock_done(ifp);
1466 if (!attached) {
1467 /* clean up flowswitch nexus */
1468 dlil_detach_flowswitch_nexus(&nexus_fsw);
1469 }
1470 }
1471 return attached;
1472 }
1473
/* Tear down the flowswitch nexus described by nexus_fsw (provider,
 * instance and device attachment). */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1481
1482 __attribute__((noinline))
1483 static void
dlil_netif_detach_notify(ifnet_t ifp)1484 dlil_netif_detach_notify(ifnet_t ifp)
1485 {
1486 void (*detach_notify)(struct nexus_netif_adapter *);
1487
1488 /*
1489 * This is only needed for low latency interfaces for now.
1490 */
1491 if (!ifnet_is_low_latency(ifp)) {
1492 return;
1493 }
1494 detach_notify = (ifp->if_na_ops != NULL) ? ifp->if_na_ops->ni_detach_notify : NULL;
1495 if (detach_notify != NULL) {
1496 (*detach_notify)(ifp->if_na);
1497 } else {
1498 DLIL_PRINTF("%s: %s has no detach notify calback\n",
1499 __func__, if_name(ifp));
1500 }
1501 }
1502
/*
 * Quiesce data movement on the interface, detach whatever flowswitch
 * and netif nexuses are attached (asserting that partially-attached
 * state never occurs), clear the ifnet's nexus records, and resume.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block and drain all data movement before tearing down */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		/* no flowswitch: all of its UUIDs must be unset */
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		/* no netif: all of its UUIDs must be unset */
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1534
1535 boolean_t
ifnet_add_netagent(ifnet_t ifp)1536 ifnet_add_netagent(ifnet_t ifp)
1537 {
1538 int error;
1539
1540 error = kern_nexus_interface_add_netagent(ifp);
1541 os_log(OS_LOG_DEFAULT,
1542 "kern_nexus_interface_add_netagent(%s) returned %d",
1543 ifp->if_xname, error);
1544 return error == 0;
1545 }
1546
1547 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1548 ifnet_remove_netagent(ifnet_t ifp)
1549 {
1550 int error;
1551
1552 error = kern_nexus_interface_remove_netagent(ifp);
1553 os_log(OS_LOG_DEFAULT,
1554 "kern_nexus_interface_remove_netagent(%s) returned %d",
1555 ifp->if_xname, error);
1556 return error == 0;
1557 }
1558
1559 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1560 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1561 {
1562 if (!IF_FULLY_ATTACHED(ifp)) {
1563 return FALSE;
1564 }
1565 return dlil_attach_flowswitch_nexus(ifp);
1566 }
1567
1568 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1569 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1570 {
1571 if_nexus_flowswitch nexus_fsw;
1572
1573 ifnet_lock_exclusive(ifp);
1574 nexus_fsw = ifp->if_nx_flowswitch;
1575 bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1576 ifnet_lock_done(ifp);
1577 return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1578 nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1579 }
1580
1581 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1582 ifnet_attach_netif_nexus(ifnet_t ifp)
1583 {
1584 boolean_t nexus_attached;
1585 if_nexus_netif nexus_netif;
1586
1587 if (!IF_FULLY_ATTACHED(ifp)) {
1588 return FALSE;
1589 }
1590 nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1591 if (nexus_attached) {
1592 ifnet_lock_exclusive(ifp);
1593 ifp->if_nx_netif = nexus_netif;
1594 ifnet_lock_done(ifp);
1595 }
1596 return nexus_attached;
1597 }
1598
1599 boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)1600 ifnet_detach_netif_nexus(ifnet_t ifp)
1601 {
1602 if_nexus_netif nexus_netif;
1603
1604 ifnet_lock_exclusive(ifp);
1605 nexus_netif = ifp->if_nx_netif;
1606 bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
1607 ifnet_lock_done(ifp);
1608
1609 return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
1610 nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
1611 }
1612
1613 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1614 ifnet_attach_native_flowswitch(ifnet_t ifp)
1615 {
1616 if (!dlil_is_native_netif_nexus(ifp)) {
1617 /* not a native netif */
1618 return;
1619 }
1620 ifnet_attach_flowswitch_nexus(ifp);
1621 }
1622
1623 #endif /* SKYWALK */
1624
/*
 * Sanity-check an inbound mbuf: it must be a pkthdr mbuf and its receive
 * interface must match `ifp' (loopback excepted); panics otherwise.
 */
#define DLIL_INPUT_CHECK(m, ifp) { \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m); \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) || \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) { \
		panic_plain("%s: invalid mbuf %p\n", __func__, m); \
		/* NOTREACHED */ \
	} \
}

/*
 * Exponentially-weighted moving average: folds `new' into `old' with a
 * weight of 1/2^decay; a zero `old' is seeded with `new' directly.
 */
#define DLIL_EWMA(old, new, decay) do { \
	u_int32_t _avg; \
	if ((_avg = (old)) > 0) \
		_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else \
		_avg = (new); \
	(old) = _avg; \
} while (0)

#define MBPS (1ULL * 1000 * 1000)
#define GBPS (MBPS * 1000)

/* per-link-speed packet/byte watermarks for input polling */
struct rxpoll_time_tbl {
	u_int64_t speed; /* downlink speed */
	u_int32_t plowat; /* packets low watermark */
	u_int32_t phiwat; /* packets high watermark */
	u_int32_t blowat; /* bytes low watermark */
	u_int32_t bhiwat; /* bytes high watermark */
};

/* ordered by speed; terminated by the all-zero sentinel entry */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};

/* serializes dlil_pending_thread_cnt updates and the startup wakeup */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;
1666
1667 static void
dlil_incr_pending_thread_count(void)1668 dlil_incr_pending_thread_count(void)
1669 {
1670 LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1671 lck_mtx_lock(&dlil_thread_sync_lock);
1672 dlil_pending_thread_cnt++;
1673 lck_mtx_unlock(&dlil_thread_sync_lock);
1674 }
1675
1676 static void
dlil_decr_pending_thread_count(void)1677 dlil_decr_pending_thread_count(void)
1678 {
1679 LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1680 lck_mtx_lock(&dlil_thread_sync_lock);
1681 VERIFY(dlil_pending_thread_cnt > 0);
1682 dlil_pending_thread_cnt--;
1683 if (dlil_pending_thread_cnt == 0) {
1684 wakeup(&dlil_pending_thread_cnt);
1685 }
1686 lck_mtx_unlock(&dlil_thread_sync_lock);
1687 }
1688
1689 int
proto_hash_value(u_int32_t protocol_family)1690 proto_hash_value(u_int32_t protocol_family)
1691 {
1692 /*
1693 * dlil_proto_unplumb_all() depends on the mapping between
1694 * the hash bucket index and the protocol family defined
1695 * here; future changes must be applied there as well.
1696 */
1697 switch (protocol_family) {
1698 case PF_INET:
1699 return 0;
1700 case PF_INET6:
1701 return 1;
1702 case PF_VLAN:
1703 return 2;
1704 case PF_UNSPEC:
1705 default:
1706 return 3;
1707 }
1708 }
1709
1710 /*
1711 * Caller must already be holding ifnet lock.
1712 */
static struct if_proto *
find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
{
	struct if_proto *proto = NULL;
	u_int32_t i = proto_hash_value(protocol_family);

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);

	/* walk the hash bucket for this protocol family */
	if (ifp->if_proto_hash != NULL) {
		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
	}

	while (proto != NULL && proto->protocol_family != protocol_family) {
		proto = SLIST_NEXT(proto, next_hash);
	}

	/* return a referenced entry; caller drops it via if_proto_free() */
	if (proto != NULL) {
		if_proto_ref(proto);
	}

	return proto;
}
1735
/* Take a reference on an if_proto; released via if_proto_free(). */
static void
if_proto_ref(struct if_proto *proto)
{
	atomic_add_32(&proto->refcount, 1);
}
1741
1742 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1743
/*
 * Drop a reference on an if_proto.  On the final release: invokes the
 * protocol's detached callback (v1 or v2 KPI), purges its routes,
 * posts KEV_DL_PROTO_DETACHED, marks the interface down when no
 * protocols remain, and frees the structure.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = atomic_add_32_ov(&proto->refcount, -1);
	if (oldval > 1) {
		/* other references remain; nothing more to do */
		return;
	}

	/* notify the protocol (whichever KPI version it registered) */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1805
1806 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1807 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1808 {
1809 #if !MACH_ASSERT
1810 #pragma unused(ifp)
1811 #endif
1812 unsigned int type = 0;
1813 int ass = 1;
1814
1815 switch (what) {
1816 case IFNET_LCK_ASSERT_EXCLUSIVE:
1817 type = LCK_RW_ASSERT_EXCLUSIVE;
1818 break;
1819
1820 case IFNET_LCK_ASSERT_SHARED:
1821 type = LCK_RW_ASSERT_SHARED;
1822 break;
1823
1824 case IFNET_LCK_ASSERT_OWNED:
1825 type = LCK_RW_ASSERT_HELD;
1826 break;
1827
1828 case IFNET_LCK_ASSERT_NOTOWNED:
1829 /* nothing to do here for RW lock; bypass assert */
1830 ass = 0;
1831 break;
1832
1833 default:
1834 panic("bad ifnet assert type: %d", what);
1835 /* NOTREACHED */
1836 }
1837 if (ass) {
1838 LCK_RW_ASSERT(&ifp->if_lock, type);
1839 }
1840 }
1841
/* Acquire the per-interface RW lock for reading. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}
1847
/* Acquire the per-interface RW lock for writing. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}
1853
/* Release the per-interface RW lock (shared or exclusive). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1859
1860 #if INET
/* Acquire the per-interface INET data RW lock for reading. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}
1866
/* Acquire the per-interface INET data RW lock for writing. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}
1872
/* Release the per-interface INET data RW lock. */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
1878 #endif
1879
/* Acquire the per-interface INET6 data RW lock for reading. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}
1885
/* Acquire the per-interface INET6 data RW lock for writing. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}
1891
/* Release the per-interface INET6 data RW lock. */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1897
/* Acquire the global interface-list RW lock for reading. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}
1903
/* Acquire the global interface-list RW lock for writing. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}
1909
/* Release the global interface-list RW lock. */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}
1915
/* Assert that the global interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1921
1922 /*
1923 * dlil_ifp_protolist
1924 * - get the list of protocols attached to the interface, or just the number
1925 * of attached protocols
1926 * - if the number returned is greater than 'list_count', truncation occurred
1927 *
1928 * Note:
1929 * - caller must already be holding ifnet lock.
1930 */
1931 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1932 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1933 u_int32_t list_count)
1934 {
1935 u_int32_t count = 0;
1936 int i;
1937
1938 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1939
1940 if (ifp->if_proto_hash == NULL) {
1941 goto done;
1942 }
1943
1944 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1945 struct if_proto *proto;
1946 SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1947 if (list != NULL && count < list_count) {
1948 list[count] = proto->protocol_family;
1949 }
1950 count++;
1951 }
1952 }
1953 done:
1954 return count;
1955 }
1956
1957 __private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp,u_int32_t * protolist,u_int32_t count)1958 if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
1959 {
1960 ifnet_lock_shared(ifp);
1961 count = dlil_ifp_protolist(ifp, protolist, count);
1962 ifnet_lock_done(ifp);
1963 return count;
1964 }
1965
/* Free a protocol list previously returned to the caller. */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1971
1972 __private_extern__ int
dlil_post_msg(struct ifnet * ifp,u_int32_t event_subclass,u_int32_t event_code,struct net_event_data * event_data,u_int32_t event_data_len,boolean_t suppress_generation)1973 dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
1974 u_int32_t event_code, struct net_event_data *event_data,
1975 u_int32_t event_data_len, boolean_t suppress_generation)
1976 {
1977 struct net_event_data ev_data;
1978 struct kev_msg ev_msg;
1979
1980 bzero(&ev_msg, sizeof(ev_msg));
1981 bzero(&ev_data, sizeof(ev_data));
1982 /*
1983 * a net event always starts with a net_event_data structure
1984 * but the caller can generate a simple net event or
1985 * provide a longer event structure to post
1986 */
1987 ev_msg.vendor_code = KEV_VENDOR_APPLE;
1988 ev_msg.kev_class = KEV_NETWORK_CLASS;
1989 ev_msg.kev_subclass = event_subclass;
1990 ev_msg.event_code = event_code;
1991
1992 if (event_data == NULL) {
1993 event_data = &ev_data;
1994 event_data_len = sizeof(struct net_event_data);
1995 }
1996
1997 strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
1998 event_data->if_family = ifp->if_family;
1999 event_data->if_unit = (u_int32_t)ifp->if_unit;
2000
2001 ev_msg.dv[0].data_length = event_data_len;
2002 ev_msg.dv[0].data_ptr = event_data;
2003 ev_msg.dv[1].data_length = 0;
2004
2005 bool update_generation = true;
2006 if (event_subclass == KEV_DL_SUBCLASS) {
2007 /* Don't update interface generation for frequent link quality and state changes */
2008 switch (event_code) {
2009 case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
2010 case KEV_DL_RRC_STATE_CHANGED:
2011 case KEV_DL_PRIMARY_ELECTED:
2012 update_generation = false;
2013 break;
2014 default:
2015 break;
2016 }
2017 }
2018
2019 /*
2020 * Some events that update generation counts might
2021 * want to suppress generation count.
2022 * One example is node presence/absence where we still
2023 * issue kernel event for the invocation but want to avoid
2024 * expensive operation of updating generation which triggers
2025 * NECP client updates.
2026 */
2027 if (suppress_generation) {
2028 update_generation = false;
2029 }
2030
2031 return dlil_event_internal(ifp, &ev_msg, update_generation);
2032 }
2033
/*
 * Allocate the per-interface TCP/UDP/ECN statistics structures.
 * The tcpstat/udpstat blocks come from oversized zone buffers so the
 * stats themselves can be placed at a 64-bit aligned address; the
 * original buffer pointer is stashed one pointer-size before the
 * aligned base so it can be recovered at free time.
 *
 * Returns 0 on success, EINVAL otherwise.
 *
 * NOTE(review): if called when if_tcp_stat/if_udp_stat are already
 * set (but not both NULL), ret stays EINVAL and the cleanup below
 * frees the existing allocations — presumably this is only invoked
 * once per ifnet; verify with callers.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* error path: undo any allocations recorded on the ifnet */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original zone buffer saved before the aligned base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2119
/*
 * Reset all opportunistic-polling state on 'ifp': clears the poll
 * cycle, mode/flags/counters, the tstats/pstats/sstats blocks, and
 * every poll-related timer.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2138
/*
 * Create and start the DLIL input thread for 'inp'.
 *
 * ifp == NULL selects the main input thread (dlil_init time); otherwise
 * the interface's flags pick one of three strategies: legacy hybrid
 * polling, asynchronous input, or synchronous input (no dedicated
 * thread function, ENODEV is returned after the queues are set up).
 * On success the new thread's continuation is stored via 'thfunc'
 * (when non-NULL) and 0 is returned; failure to start a thread panics.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* legacy hybrid polling requires IFEF_RXPOLL + IFXF_LEGACY + net_rxpoll */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */
	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/* no polling: place no practical bound on the receive queue */
		limit = (u_int32_t)-1;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		/* main thread also owns the loopback receive queue */
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no dedicated thread is created */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2273
2274 #if TEST_INPUT_THREAD_TERMINATION
/*
 * sysctl handler for if_input_thread_termination_spin: reads back the
 * current value, and accepts a new one only when rx polling is enabled
 * (ENXIO otherwise).
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* read-only access, or copy error */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
2296 #endif /* TEST_INPUT_THREAD_TERMINATION */
2297
/*
 * Return a dlil_threading_info to its pristine state after its input
 * thread has terminated: destroys the per-thread mutex and lock group
 * and zeroes all bookkeeping.  VERIFYs that the packet queue is empty
 * and no affinity/driver/poller thread state is left behind.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2323
/*
 * Terminate the calling input thread (never the main one): drains the
 * pending packet queue, signals DLIL_INPUT_TERMINATE_COMPLETE to the
 * waiter, drops the kernel_thread_start() reference, and finally
 * terminates the current thread.  Must be called on the thread itself.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* detach any queued packets before signalling completion */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2371
2372 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2373 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2374 {
2375 thread_affinity_policy_data_t policy;
2376
2377 bzero(&policy, sizeof(policy));
2378 policy.affinity_tag = tag;
2379 return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2380 (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2381 }
2382
2383 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
2384 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2385 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2386 enum net_filter_event_subsystems state)
2387 {
2388 bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
2389 if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
2390 if_enable_fsw_transport_netagent = 1;
2391 } else {
2392 if_enable_fsw_transport_netagent = 0;
2393 }
2394 if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
2395 kern_nexus_update_netagents();
2396 } else if (!if_enable_fsw_transport_netagent) {
2397 necp_update_all_clients();
2398 }
2399 }
2400 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2401
/*
 * One-time DLIL subsystem initialization.
 *
 * Validates (at compile time) alignment and cross-header constant
 * invariants, parses boot-args, configures Skywalk netagent policy
 * (when built in), creates the allocation zones for dlil_ifnet and
 * the per-interface TCP/UDP stats, initializes dependent subsystems,
 * and finally starts the main DLIL input thread and the interface
 * detacher thread, blocking until both have been scheduled once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_DEFAULT == IFNET_SUBFAMILY_DEFAULT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* boot-arg overrides for DLIL tunables */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled. This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
#ifdef XNU_TARGET_OS_OSX
	if (if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}
#endif /* XNU_TARGET_OS_OSX */

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	/* debug builds use the larger dlil_ifnet_dbg variant */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2714
/*
 * Take one "busy" reference on ifp's interface-filter list, blocking
 * concurrent if_flt_monitor_enter() callers until it is released via
 * if_flt_monitor_leave()/if_flt_monitor_unbusy().
 * Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_busy(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	++ifp->if_flt_busy;
	/* wraparound to zero would indicate a leaked busy reference */
	VERIFY(ifp->if_flt_busy != 0);
}
2723
/*
 * Drop a "busy" reference taken with if_flt_monitor_busy(); identical
 * to leaving the monitor.  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2729
/*
 * Enter the single-entry monitor protecting ifp's filter list: sleep
 * until no other thread holds a busy reference, then take one for
 * ourselves.  Caller must hold if_flt_lock; msleep() drops and
 * reacquires it while waiting.
 */
static void
if_flt_monitor_enter(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	while (ifp->if_flt_busy) {
		++ifp->if_flt_waiters;
		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
		    (PZERO - 1), "if_flt_monitor", NULL);
	}
	if_flt_monitor_busy(ifp);
}
2742
/*
 * Release one "busy" reference on ifp's filter list and, if the list
 * just became idle, wake up every thread parked in
 * if_flt_monitor_enter().  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_flt_busy != 0);
	--ifp->if_flt_busy;

	/* last one out wakes all waiters; they re-check under the lock */
	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
2756
/*
 * Attach an interface filter to ifp and return a reference to the
 * newly allocated filter via filter_ref.  Callbacks are copied from
 * if_filter, except on internal coproc interfaces where only the
 * detached callback is installed.  Returns 0 on success, or ENXIO if
 * the interface is not in the global list or is no longer attached.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* takes an IO refcnt on success; released near the end below */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 */
	if (!IFNET_IS_INTCOPROC(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	/* serialize the list insertion against other filter-list walkers */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* non-internal filters are additionally counted per-ifnet */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the IO refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	/* defensive: every failure path above branches here before zalloc */
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2846
/*
 * Detach an interface filter.
 *
 * With detached == 0, search every interface's filter list for
 * `filter', unlink it (using the filter monitor to synchronize with
 * concurrent list users), fix up the counters, invoke the detached
 * callback and free the filter; returns EINVAL if the filter is not
 * found on any list.
 *
 * With detached != 0 (implicit detach from ifnet_detach_final(), where
 * the list has already been emptied) only the counters are adjusted
 * before the callback/free.
 */
static int
dlil_detach_filter_internal(interface_filter_t filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1;	/* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* only reachable with retval != 0 via the EINVAL path above */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2967
2968 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2969 dlil_detach_filter(interface_filter_t filter)
2970 {
2971 if (filter == NULL) {
2972 return;
2973 }
2974 dlil_detach_filter_internal(filter, 0);
2975 }
2976
2977 __private_extern__ boolean_t
dlil_has_ip_filter(void)2978 dlil_has_ip_filter(void)
2979 {
2980 boolean_t has_filter = (net_api_stats.nas_ipf_add_count > 0);
2981 DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2982 return has_filter;
2983 }
2984
2985 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2986 dlil_has_if_filter(struct ifnet *ifp)
2987 {
2988 boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2989 DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2990 return has_filter;
2991 }
2992
/*
 * Mark the input thread as having pending work and wake it if it is
 * parked.  Caller must hold dlth_lock.  The wakeup is skipped while
 * DLIL_INPUT_RUNNING is set: the running continuation re-checks the
 * flags under the lock before parking, so it will see WAITING.
 */
static inline void
dlil_input_wakeup(struct dlil_threading_info *inp)
{
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);

	inp->dlth_flags |= DLIL_INPUT_WAITING;
	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
		inp->dlth_wtot++;	/* wakeup tally, fed into rxpoll EWMA */
		wakeup_one((caddr_t)&inp->dlth_flags);
	}
}
3004
/*
 * Bootstrap entry point for the main DLIL input thread.  Verifies the
 * thread-state invariants, marks the thread embryonic, issues a
 * self-wakeup so the continuation runs once immediately (which clears
 * the embryonic state and decrements the pending-thread count that
 * dlil_init() waits on), then blocks into
 * dlil_main_input_thread_cont(), which never returns here.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);	/* main thread has no dedicated ifnet */
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3027
3028 /*
3029 * Main input thread:
3030 *
3031 * a) handles all inbound packets for lo0
3032 * b) handles all inbound packets for interfaces with no dedicated
3033 * input thread (e.g. anything but Ethernet/PDP or those that support
3034 * opportunistic polling.)
3035 * c) protocol registrations
3036 * d) packet injections
3037 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/*
	 * Same object viewed two ways: the extended main-thread variant
	 * carries the lo0-only receive queue (lo_rcvq_pkts).
	 */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblock dlil_init() waiting on the pending count */
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* park once nothing besides RUNNING is set */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	/* no pending work; wait for the next dlil_input_wakeup() */
	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);	/* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3124
3125 /*
3126 * Input thread for interfaces with legacy input model.
3127 */
/*
 * Bootstrap entry point for a per-interface input thread (legacy
 * model).  Names the thread after the interface, marks it embryonic,
 * self-wakes so the continuation runs once, and blocks into
 * dlil_input_thread_cont(), which never returns here.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* legacy input thread excludes the RXPOLL+LEGACY combination */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3162
/*
 * Continuation for the per-interface (legacy model) input thread:
 * drain dlth_pkts, feed the chain into
 * dlil_input_packet_list_extended(), sync interface stats, and park
 * again.  Honors DLIL_INPUT_TERMINATE (or an interrupted wait) by
 * tearing the thread down via dlil_terminate_input_thread().
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblock the attach path waiting on this thread */
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* park once only RUNNING and/or TERMINATE remain set */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);	/* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3266
3267 /*
3268 * Input thread for interfaces with opportunistic polling input model.
3269 */
/*
 * Bootstrap entry point for a per-interface input thread using
 * opportunistic polling (requires IFEF_RXPOLL together with
 * IFXF_LEGACY).  Names the thread, marks it embryonic, self-wakes so
 * the continuation runs once, and blocks into
 * dlil_rxpoll_input_thread_cont(), which never returns here.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3301
/*
 * Continuation for the opportunistic-polling input thread.  Each pass
 * drains dlth_pkts, accumulates sampling statistics (min/max and EWMA
 * of inbound packets, bytes and wakeup requests), and once the sample
 * hold time elapses decides whether to switch the interface between
 * IFNET_MODEL_INPUT_POLL_ON and _OFF based on the configured
 * watermarks.  A mode change is pushed to the driver through the
 * if_input_ctl downcall before the drained chain is processed.
 * Honors DLIL_INPUT_TERMINATE (or an interrupted wait) by tearing the
 * thread down via dlil_terminate_input_thread().
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/*
		 * First pass after creation: leave the embryonic state and
		 * skip straight to the stats sync (no packets yet).
		 */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* enforce the floor on the polling interval */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* keep sampling until the hold time has elapsed */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				/* rate-limit the debug output */
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* don't flap modes faster than the mode hold time */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			/* unblock the attach path waiting on this thread */
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* push the new input model down to the driver */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* park once only RUNNING and/or TERMINATE remain set */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);	/* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3587
3588 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3589 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3590 {
3591 if (p != NULL) {
3592 if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3593 (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3594 return EINVAL;
3595 }
3596 if (p->packets_lowat != 0 && /* hiwat must be non-zero */
3597 p->packets_lowat >= p->packets_hiwat) {
3598 return EINVAL;
3599 }
3600 if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3601 (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3602 return EINVAL;
3603 }
3604 if (p->bytes_lowat != 0 && /* hiwat must be non-zero */
3605 p->bytes_lowat >= p->bytes_hiwat) {
3606 return EINVAL;
3607 }
3608 if (p->interval_time != 0 &&
3609 p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3610 p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3611 }
3612 }
3613 return 0;
3614 }
3615
/*
 * Recompute ifp's polling parameters.  With no input link rate and no
 * caller-supplied parameters, polling is effectively disabled: zero
 * low watermarks, maximal high watermarks, zero sample hold time.
 * Otherwise watermarks are auto-tuned from rxpoll_tbl based on the
 * link rate unless the caller supplied non-zero values; the packet
 * limit and interval additionally honor the global knobs
 * (if_rxpoll_max, if_rxpoll_interval_time) over caller values.
 * Caller is expected to hold the input thread's dlth_lock — see
 * dlil_rxpoll_set_params().
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;	/* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* pick the highest table entry whose speed <= inbw */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/* a non-zero if_rxpoll_max overrides the caller's limit */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ? if_rxpoll_max : p->packets_limit);
		/* caller's interval is honored only at the default setting */
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3685
3686 /*
3687 * Must be called on an attached ifnet (caller is expected to check.)
3688 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3689 */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
    errno_t err;
    struct dlil_threading_info *inp;

    VERIFY(ifp != NULL);
    /* Opportunistic polling must be enabled and an input thread present. */
    if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
        return ENXIO;
    }
    /* Reject malformed driver-supplied parameters before taking the lock. */
    err = dlil_rxpoll_validate_params(p);
    if (err != 0) {
        return err;
    }

    /* "locked" means the caller already holds the input thread lock. */
    if (!locked) {
        lck_mtx_lock(&inp->dlth_lock);
    }
    LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
    /*
     * Normally, we'd reset the parameters to the auto-tuned values
     * if the input thread detects a change in link rate.  If the
     * driver provides its own parameters right after a link rate
     * changes, but before the input thread gets to run, we want to
     * make sure to keep the driver's values.  Clearing if_poll_update
     * will achieve that.
     */
    if (p != NULL && !locked && ifp->if_poll_update != 0) {
        ifp->if_poll_update = 0;
    }
    dlil_rxpoll_update_params(ifp, p);
    if (!locked) {
        lck_mtx_unlock(&inp->dlth_lock);
    }
    return 0;
}
3727
3728 /*
3729 * Must be called on an attached ifnet (caller is expected to check.)
3730 */
3731 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3732 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3733 {
3734 struct dlil_threading_info *inp;
3735
3736 VERIFY(ifp != NULL && p != NULL);
3737 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3738 return ENXIO;
3739 }
3740
3741 bzero(p, sizeof(*p));
3742
3743 lck_mtx_lock(&inp->dlth_lock);
3744 p->packets_limit = ifp->if_rxpoll_plim;
3745 p->packets_lowat = ifp->if_rxpoll_plowat;
3746 p->packets_hiwat = ifp->if_rxpoll_phiwat;
3747 p->bytes_lowat = ifp->if_rxpoll_blowat;
3748 p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3749 p->interval_time = ifp->if_rxpoll_ival;
3750 lck_mtx_unlock(&inp->dlth_lock);
3751
3752 return 0;
3753 }
3754
/*
 * Basic driver input KPI: hand a packet chain to DLIL with no explicit
 * tail, no required stats, and not in polling mode.
 */
errno_t
ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
    const struct ifnet_stat_increment_param *s)
{
    return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
}
3761
/*
 * Extended driver input KPI: caller supplies the chain tail and stats
 * (both mandatory for this variant); not in polling mode.
 */
errno_t
ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
{
    return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
}
3768
3769 errno_t
ifnet_input_poll(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3770 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3771 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3772 {
3773 return ifnet_input_common(ifp, m_head, m_tail, s,
3774 (m_head != NULL), TRUE);
3775 }
3776
3777 static errno_t
ifnet_input_common(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t ext,boolean_t poll)3778 ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
3779 const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
3780 {
3781 dlil_input_func input_func;
3782 struct ifnet_stat_increment_param _s;
3783 u_int32_t m_cnt = 0, m_size = 0;
3784 struct mbuf *last;
3785 errno_t err = 0;
3786
3787 if ((m_head == NULL && !poll) || (s == NULL && ext)) {
3788 if (m_head != NULL) {
3789 mbuf_freem_list(m_head);
3790 }
3791 return EINVAL;
3792 }
3793
3794 VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
3795 VERIFY(m_tail == NULL || ext);
3796 VERIFY(s != NULL || !ext);
3797
3798 /*
3799 * Drop the packet(s) if the parameters are invalid, or if the
3800 * interface is no longer attached; else hold an IO refcnt to
3801 * prevent it from being detached (will be released below.)
3802 */
3803 if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
3804 if (m_head != NULL) {
3805 mbuf_freem_list(m_head);
3806 }
3807 return EINVAL;
3808 }
3809
3810 input_func = ifp->if_input_dlil;
3811 VERIFY(input_func != NULL);
3812
3813 if (m_tail == NULL) {
3814 last = m_head;
3815 while (m_head != NULL) {
3816 #if IFNET_INPUT_SANITY_CHK
3817 if (__improbable(dlil_input_sanity_check != 0)) {
3818 DLIL_INPUT_CHECK(last, ifp);
3819 }
3820 #endif /* IFNET_INPUT_SANITY_CHK */
3821 m_cnt++;
3822 m_size += m_length(last);
3823 if (mbuf_nextpkt(last) == NULL) {
3824 break;
3825 }
3826 last = mbuf_nextpkt(last);
3827 }
3828 m_tail = last;
3829 } else {
3830 #if IFNET_INPUT_SANITY_CHK
3831 if (__improbable(dlil_input_sanity_check != 0)) {
3832 last = m_head;
3833 while (1) {
3834 DLIL_INPUT_CHECK(last, ifp);
3835 m_cnt++;
3836 m_size += m_length(last);
3837 if (mbuf_nextpkt(last) == NULL) {
3838 break;
3839 }
3840 last = mbuf_nextpkt(last);
3841 }
3842 } else {
3843 m_cnt = s->packets_in;
3844 m_size = s->bytes_in;
3845 last = m_tail;
3846 }
3847 #else
3848 m_cnt = s->packets_in;
3849 m_size = s->bytes_in;
3850 last = m_tail;
3851 #endif /* IFNET_INPUT_SANITY_CHK */
3852 }
3853
3854 if (last != m_tail) {
3855 panic_plain("%s: invalid input packet chain for %s, "
3856 "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
3857 m_tail, last);
3858 }
3859
3860 /*
3861 * Assert packet count only for the extended variant, for backwards
3862 * compatibility, since this came directly from the device driver.
3863 * Relax this assertion for input bytes, as the driver may have
3864 * included the link-layer headers in the computation; hence
3865 * m_size is just an approximation.
3866 */
3867 if (ext && s->packets_in != m_cnt) {
3868 panic_plain("%s: input packet count mismatch for %s, "
3869 "%d instead of %d\n", __func__, if_name(ifp),
3870 s->packets_in, m_cnt);
3871 }
3872
3873 if (s == NULL) {
3874 bzero(&_s, sizeof(_s));
3875 s = &_s;
3876 } else {
3877 _s = *s;
3878 }
3879 _s.packets_in = m_cnt;
3880 _s.bytes_in = m_size;
3881
3882 err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());
3883
3884 if (ifp != lo_ifp) {
3885 /* Release the IO refcnt */
3886 ifnet_datamov_end(ifp);
3887 }
3888
3889 return err;
3890 }
3891
3892 #if SKYWALK
/*
 * Atomically install "fn" as the interface's DLIL input function, but
 * only if the default handler is currently in place; EBUSY otherwise.
 * ptrauth_nop_cast allows the signed function pointers to be compared
 * and stored through the void * CAS primitive.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
    return atomic_test_set_ptr(&ifp->if_input_dlil,
               ptrauth_nop_cast(void *, &dlil_input_handler),
               ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3900
/*
 * Restore the default DLIL input handler.  The CAS is retried until it
 * succeeds, since if_input_dlil may be changing concurrently.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
    while (!atomic_test_set_ptr(&ifp->if_input_dlil,
        ptrauth_nop_cast(void *, ifp->if_input_dlil),
        ptrauth_nop_cast(void *, &dlil_input_handler))) {
        ;
    }
}
3910
/*
 * Atomically install "fn" as the interface's DLIL output function, but
 * only if the default handler is currently in place; EBUSY otherwise.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
    return atomic_test_set_ptr(&ifp->if_output_dlil,
               ptrauth_nop_cast(void *, &dlil_output_handler),
               ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3918
/*
 * Restore the default DLIL output handler, retrying the CAS until it
 * succeeds (if_output_dlil may be changing concurrently).
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
    while (!atomic_test_set_ptr(&ifp->if_output_dlil,
        ptrauth_nop_cast(void *, ifp->if_output_dlil),
        ptrauth_nop_cast(void *, &dlil_output_handler))) {
        ;
    }
}
3928 #endif /* SKYWALK */
3929
3930 errno_t
dlil_output_handler(struct ifnet * ifp,struct mbuf * m)3931 dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
3932 {
3933 return ifp->if_output(ifp, m);
3934 }
3935
/*
 * Default DLIL input handler: dispatch an inbound packet chain via the
 * interface's input thread strategy (async or sync).  Interfaces with
 * no dedicated input thread fall back to the main input thread.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
    struct dlil_threading_info *inp = ifp->if_inp;

    if (__improbable(inp == NULL)) {
        inp = dlil_main_input_thread;
    }

#if (DEVELOPMENT || DEBUG)
    /* Threads marked for synchronous RX process the chain inline. */
    if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
        return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
    } else
#endif /* (DEVELOPMENT || DEBUG) */
    {
        return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
    }
}
3956
/*
 * Asynchronous input strategy: enqueue the packet chain on the input
 * thread's receive queue, update stats, and wake the input thread to
 * process it.  Returns 0.
 *
 * "s" carries the packet/byte counts for the chain; "tp" is the calling
 * (driver/poller) thread, used once to set up thread affinity.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
    u_int32_t m_cnt = s->packets_in;
    u_int32_t m_size = s->bytes_in;
    boolean_t notify = FALSE;

    /*
     * If there is a matching DLIL input thread associated with an
     * affinity set, associate this thread with the same set. We
     * will only do this once.
     */
    lck_mtx_lock_spin(&inp->dlth_lock);
    if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
        ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
        (poll && inp->dlth_poller_thread == THREAD_NULL))) {
        u_int32_t tag = inp->dlth_affinity_tag;

        /* Record which slot (driver vs. poller) this thread fills. */
        if (poll) {
            VERIFY(inp->dlth_poller_thread == THREAD_NULL);
            inp->dlth_poller_thread = tp;
        } else {
            VERIFY(inp->dlth_driver_thread == THREAD_NULL);
            inp->dlth_driver_thread = tp;
        }
        /* Drop the lock: dlil_affinity_set() may block. */
        lck_mtx_unlock(&inp->dlth_lock);

        /* Associate the current thread with the new affinity tag */
        (void) dlil_affinity_set(tp, tag);

        /*
         * Take a reference on the current thread; during detach,
         * we will need to refer to it in order to tear down its
         * affinity.
         */
        thread_reference(tp);
        lck_mtx_lock_spin(&inp->dlth_lock);
    }

    VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

    /*
     * Because of loopbacked multicast we cannot stuff the ifp in
     * the rcvif of the packet header: loopback (lo0) packets use a
     * dedicated list so that we can later associate them with lo_ifp
     * on their way up the stack. Packets for other interfaces without
     * dedicated input threads go to the regular list.
     */
    if (m_head != NULL) {
        classq_pkt_t head, tail;
        CLASSQ_PKT_INIT_MBUF(&head, m_head);
        CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
        if (inp == dlil_main_input_thread && ifp == lo_ifp) {
            struct dlil_main_threading_info *inpm =
                (struct dlil_main_threading_info *)inp;
            _addq_multi(&inpm->lo_rcvq_pkts, &head, &tail,
                m_cnt, m_size);
        } else {
            _addq_multi(&inp->dlth_pkts, &head, &tail,
                m_cnt, m_size);
        }
    }

#if IFNET_INPUT_SANITY_CHK
    /* Debug mode: re-walk the chain and verify the driver's counts. */
    if (__improbable(dlil_input_sanity_check != 0)) {
        u_int32_t count = 0, size = 0;
        struct mbuf *m0;

        for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
            size += m_length(m0);
            count++;
        }

        if (count != m_cnt) {
            panic_plain("%s: invalid total packet count %u "
                "(expected %u)\n", if_name(ifp), count, m_cnt);
            /* NOTREACHED */
            __builtin_unreachable();
        } else if (size != m_size) {
            panic_plain("%s: invalid total packet size %u "
                "(expected %u)\n", if_name(ifp), size, m_size);
            /* NOTREACHED */
            __builtin_unreachable();
        }

        inp->dlth_pkts_cnt += m_cnt;
    }
#endif /* IFNET_INPUT_SANITY_CHK */

    dlil_input_stats_add(s, inp, ifp, poll);
    /*
     * If we're using the main input thread, synchronize the
     * stats now since we have the interface context. All
     * other cases involving dedicated input threads will
     * have their stats synchronized there.
     */
    if (inp == dlil_main_input_thread) {
        notify = dlil_input_stats_sync(ifp, inp);
    }

    dlil_input_wakeup(inp);
    lck_mtx_unlock(&inp->dlth_lock);

    /* Notify outside the lock to avoid holding it across the callout. */
    if (notify) {
        ifnet_notify_data_threshold(ifp);
    }

    return 0;
}
4069
/*
 * Synchronous input strategy: enqueue the chain, then immediately drain
 * the input queue and process all pending packets inline on the calling
 * thread (instead of waking the input thread).  Returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
    u_int32_t m_cnt = s->packets_in;
    u_int32_t m_size = s->bytes_in;
    boolean_t notify = FALSE;
    classq_pkt_t head, tail;

    /* The main input thread never uses the sync strategy. */
    ASSERT(inp != dlil_main_input_thread);

    /* XXX: should we just assert instead? */
    if (__improbable(m_head == NULL)) {
        return 0;
    }

    CLASSQ_PKT_INIT_MBUF(&head, m_head);
    CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

    lck_mtx_lock_spin(&inp->dlth_lock);
    _addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

#if IFNET_INPUT_SANITY_CHK
    /* Debug mode: re-walk the chain and verify the driver's counts. */
    if (__improbable(dlil_input_sanity_check != 0)) {
        u_int32_t count = 0, size = 0;
        struct mbuf *m0;

        for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
            size += m_length(m0);
            count++;
        }

        if (count != m_cnt) {
            panic_plain("%s: invalid total packet count %u "
                "(expected %u)\n", if_name(ifp), count, m_cnt);
            /* NOTREACHED */
            __builtin_unreachable();
        } else if (size != m_size) {
            panic_plain("%s: invalid total packet size %u "
                "(expected %u)\n", if_name(ifp), size, m_size);
            /* NOTREACHED */
            __builtin_unreachable();
        }

        inp->dlth_pkts_cnt += m_cnt;
    }
#endif /* IFNET_INPUT_SANITY_CHK */

    dlil_input_stats_add(s, inp, ifp, poll);

    /* Drain everything queued so far (may include earlier arrivals). */
    m_cnt = qlen(&inp->dlth_pkts);
    _getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
    /*
     * If this interface is attached to a netif nexus,
     * the stats are already incremented there; otherwise
     * do it here.
     */
    if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
    notify = dlil_input_stats_sync(ifp, inp);

    lck_mtx_unlock(&inp->dlth_lock);

    if (notify) {
        ifnet_notify_data_threshold(ifp);
    }

    /*
     * NOTE warning %%% attention !!!!
     * We should think about putting some thread starvation
     * safeguards if we deal with long chains of packets.
     */
    if (head.cp_mbuf != NULL) {
        dlil_input_packet_list_extended(NULL, head.cp_mbuf,
            m_cnt, ifp->if_poll_mode);
    }

    return 0;
}
4154
4155 #if SKYWALK
/*
 * Atomically swap the driver's output routine (saved in if_save_output)
 * for "fn"; EBUSY if another handler is already installed.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
    return atomic_test_set_ptr(&ifp->if_output,
               ptrauth_nop_cast(void *, ifp->if_save_output),
               ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4163
/*
 * Restore the driver's saved output routine, retrying the CAS until it
 * succeeds (if_output may be changing concurrently).
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
    while (!atomic_test_set_ptr(&ifp->if_output,
        ptrauth_nop_cast(void *, ifp->if_output),
        ptrauth_nop_cast(void *, ifp->if_save_output))) {
        ;
    }
}
4173
/*
 * Atomically swap the driver's start routine (saved in if_save_start)
 * for "fn"; EBUSY if another handler is already installed.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
    return atomic_test_set_ptr(&ifp->if_start,
               ptrauth_nop_cast(void *, ifp->if_save_start),
               ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4181
/*
 * Restore the driver's saved start routine, retrying the CAS until it
 * succeeds (if_start may be changing concurrently).
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
    while (!atomic_test_set_ptr(&ifp->if_start,
        ptrauth_nop_cast(void *, ifp->if_start),
        ptrauth_nop_cast(void *, ifp->if_save_start))) {
        ;
    }
}
4191 #endif /* SKYWALK */
4192
/*
 * Record a transmit request and wake the starter thread if appropriate.
 * "resetfc" clears the flow-controlled state before signaling (used by
 * the flow-control-resume path); otherwise a flow-controlled interface
 * is left alone.
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc)
{
    /* Only interfaces using the TX starter model are handled here. */
    if (!(ifp->if_eflags & IFEF_TXSTART)) {
        return;
    }
    /*
     * If the starter thread is inactive, signal it to do work,
     * unless the interface is being flow controlled from below,
     * e.g. a virtual interface being flow controlled by a real
     * network interface beneath it, or it's been disabled via
     * a call to ifnet_disable_output().
     */
    lck_mtx_lock_spin(&ifp->if_start_lock);
    if (resetfc) {
        ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
    } else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
        lck_mtx_unlock(&ifp->if_start_lock);
        return;
    }
    ifp->if_start_req++;
    /*
     * Wake the starter unless delayed-start batching (ENQUEUE_MULTI)
     * says to wait for more packets to accumulate first.
     */
    if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
        (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
        IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
        ifp->if_start_delayed == 0)) {
        (void) wakeup_one((caddr_t)&ifp->if_start_thread);
    }
    lck_mtx_unlock(&ifp->if_start_lock);
}
4222
/*
 * Public KPI: request a transmit pass, honoring flow control.
 */
void
ifnet_start(struct ifnet *ifp)
{
    ifnet_start_common(ifp, FALSE);
}
4228
/*
 * Entry point of the per-interface TX starter thread.  Names the
 * thread, optionally binds lo0's starter to the main input thread's
 * affinity set, then parks in the continuation loop
 * (ifnet_start_thread_cont) forever.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
    struct ifnet *ifp = v;
    char thread_name[MAXTHREADNAMESIZE];

    /* Construct the name for this thread, and then apply it. */
    bzero(thread_name, sizeof(thread_name));
    (void) snprintf(thread_name, sizeof(thread_name),
        "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
    /* override name for native Skywalk interface */
    if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
        (void) snprintf(thread_name, sizeof(thread_name),
            "skywalk_doorbell_%s_tx", ifp->if_xname);
    }
#endif /* SKYWALK */
    ASSERT(ifp->if_start_thread == current_thread());
    thread_set_thread_name(current_thread(), thread_name);

    /*
     * Treat the dedicated starter thread for lo0 as equivalent to
     * the driver workloop thread; if net_affinity is enabled for
     * the main input thread, associate this starter thread to it
     * by binding them with the same affinity tag. This is done
     * only once (as we only have one lo_ifp which never goes away.)
     */
    if (ifp == lo_ifp) {
        struct dlil_threading_info *inp = dlil_main_input_thread;
        struct thread *tp = current_thread();
#if SKYWALK
        /* native skywalk loopback not yet implemented */
        VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

        lck_mtx_lock(&inp->dlth_lock);
        if (inp->dlth_affinity) {
            u_int32_t tag = inp->dlth_affinity_tag;

            VERIFY(inp->dlth_driver_thread == THREAD_NULL);
            VERIFY(inp->dlth_poller_thread == THREAD_NULL);
            inp->dlth_driver_thread = tp;
            lck_mtx_unlock(&inp->dlth_lock);

            /* Associate this thread with the affinity tag */
            (void) dlil_affinity_set(tp, tag);
        } else {
            lck_mtx_unlock(&inp->dlth_lock);
        }
    }

    /* Enter the embryonic state and hand off to the continuation. */
    lck_mtx_lock(&ifp->if_start_lock);
    VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
    (void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
    ifp->if_start_embryonic = 1;
    /* wake up once to get out of embryonic state */
    ifp->if_start_req++;
    (void) wakeup_one((caddr_t)&ifp->if_start_thread);
    lck_mtx_unlock(&ifp->if_start_lock);
    (void) thread_block_parameter(ifnet_start_thread_cont, ifp);
    /* NOTREACHED */
    __builtin_unreachable();
}
4294
/*
 * Continuation body of the TX starter thread.  Services transmit
 * requests by invoking the driver's start routine until there are no
 * more pending requests (or the interface is disabled/terminating),
 * then blocks again with this function as the continuation.  On
 * termination it clears if_start_thread and self-terminates.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
    struct ifnet *ifp = v;
    struct ifclassq *ifq = ifp->if_snd;

    lck_mtx_lock_spin(&ifp->if_start_lock);
    if (__improbable(wres == THREAD_INTERRUPTED ||
        (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
        goto terminate;
    }

    /* First wakeup after creation: just leave the embryonic state. */
    if (__improbable(ifp->if_start_embryonic)) {
        ifp->if_start_embryonic = 0;
        lck_mtx_unlock(&ifp->if_start_lock);
        ifnet_decr_pending_thread_count(ifp);
        lck_mtx_lock_spin(&ifp->if_start_lock);
        goto skip;
    }

    ifp->if_start_active = 1;

    /*
     * Keep on servicing until no more request.
     */
    for (;;) {
        u_int32_t req = ifp->if_start_req;
        /* Delayed-start batching: wait for the queue to fill up. */
        if (!IFCQ_IS_EMPTY(ifq) &&
            (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
            ifp->if_start_delayed == 0 &&
            IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
            (ifp->if_eflags & IFEF_DELAY_START)) {
            ifp->if_start_delayed = 1;
            ifnet_start_delayed++;
            break;
        }
        ifp->if_start_delayed = 0;
        lck_mtx_unlock(&ifp->if_start_lock);

        /*
         * If no longer attached, don't call start because ifp
         * is being destroyed; else hold an IO refcnt to
         * prevent the interface from being detached (will be
         * released below.)
         */
        if (!ifnet_datamov_begin(ifp)) {
            lck_mtx_lock_spin(&ifp->if_start_lock);
            break;
        }

        /* invoke the driver's start routine */
        ((*ifp->if_start)(ifp));

        /*
         * Release the io ref count taken above.
         */
        ifnet_datamov_end(ifp);

        lck_mtx_lock_spin(&ifp->if_start_lock);

        /*
         * If there's no pending request or if the
         * interface has been disabled, we're done.
         */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
        if (req == ifp->if_start_req ||
            (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
            break;
        }
    }
skip:
    ifp->if_start_req = 0;
    ifp->if_start_active = 0;

#if SKYWALK
    /*
     * Wakeup any waiters, e.g. any threads waiting to
     * detach the interface from the flowswitch, etc.
     */
    if (ifp->if_start_waiters != 0) {
        ifp->if_start_waiters = 0;
        wakeup(&ifp->if_start_waiters);
    }
#endif /* SKYWALK */
    if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
        uint64_t deadline = TIMEOUT_WAIT_FOREVER;
        struct timespec delay_start_ts;
        struct timespec *ts;

        /*
         * Wakeup N ns from now if rate-controlled by TBR, and if
         * there are still packets in the send queue which haven't
         * been dequeued so far; else sleep indefinitely (ts = NULL)
         * until ifnet_start() is called again.
         */
        ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
            &ifp->if_start_cycle : NULL);

        /* Delayed start pending: wake up after the delay timeout. */
        if (ts == NULL && ifp->if_start_delayed == 1) {
            delay_start_ts.tv_sec = 0;
            delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
            ts = &delay_start_ts;
        }

        /* A zero interval means "no timed wakeup". */
        if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
            ts = NULL;
        }

        if (__improbable(ts != NULL)) {
            clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
                (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
        }

        (void) assert_wait_deadline(&ifp->if_start_thread,
            THREAD_UNINT, deadline);
        lck_mtx_unlock(&ifp->if_start_lock);
        (void) thread_block_parameter(ifnet_start_thread_cont, ifp);
        /* NOTREACHED */
    } else {
terminate:
        /* interface is detached? */
        ifnet_set_start_cycle(ifp, NULL);

        /* clear if_start_thread to allow termination to continue */
        ASSERT(ifp->if_start_thread != THREAD_NULL);
        ifp->if_start_thread = THREAD_NULL;
        wakeup((caddr_t)&ifp->if_start_thread);
        lck_mtx_unlock(&ifp->if_start_lock);

        if (dlil_verbose) {
            DLIL_PRINTF("%s: starter thread terminated\n",
                if_name(ifp));
        }

        /* for the extra refcnt from kernel_thread_start() */
        thread_deallocate(current_thread());
        /* this is the end */
        thread_terminate(current_thread());
        /* NOTREACHED */
    }

    /* must never get here */
    VERIFY(0);
    /* NOTREACHED */
    __builtin_unreachable();
}
4442
4443 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4444 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4445 {
4446 if (ts == NULL) {
4447 bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4448 } else {
4449 *(&ifp->if_start_cycle) = *ts;
4450 }
4451
4452 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4453 DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4454 if_name(ifp), ts->tv_nsec);
4455 }
4456 }
4457
/*
 * Record a poll request and wake the poller thread if it is idle.
 * Caller must hold if_poll_lock.
 */
static inline void
ifnet_poll_wakeup(struct ifnet *ifp)
{
    LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);

    ifp->if_poll_req++;
    if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
        ifp->if_poll_thread != THREAD_NULL) {
        wakeup_one((caddr_t)&ifp->if_poll_thread);
    }
}
4469
/*
 * Public entry: request an RX poll pass on "ifp".
 */
void
ifnet_poll(struct ifnet *ifp)
{
    /*
     * If the poller thread is inactive, signal it to do work.
     */
    lck_mtx_lock_spin(&ifp->if_poll_lock);
    ifnet_poll_wakeup(ifp);
    lck_mtx_unlock(&ifp->if_poll_lock);
}
4480
/*
 * Entry point of the per-interface RX poller thread.  Names the thread,
 * enters the embryonic state, and parks in the continuation loop
 * (ifnet_poll_thread_cont) forever.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
    char thread_name[MAXTHREADNAMESIZE];
    struct ifnet *ifp = v;

    VERIFY(ifp->if_eflags & IFEF_RXPOLL);
    VERIFY(current_thread() == ifp->if_poll_thread);

    /* construct the name for this thread, and then apply it */
    bzero(thread_name, sizeof(thread_name));
    (void) snprintf(thread_name, sizeof(thread_name),
        "ifnet_poller_%s", ifp->if_xname);
    thread_set_thread_name(ifp->if_poll_thread, thread_name);

    lck_mtx_lock(&ifp->if_poll_lock);
    VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
    (void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
    ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
    /* wake up once to get out of embryonic state */
    ifnet_poll_wakeup(ifp);
    lck_mtx_unlock(&ifp->if_poll_lock);
    (void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
    /* NOTREACHED */
    __builtin_unreachable();
}
4509
/*
 * Continuation body of the RX poller thread.  Repeatedly calls the
 * driver's input poll routine and feeds any harvested packet chain to
 * ifnet_input_common() until no requests remain, then blocks again with
 * this function as the continuation.  On termination it clears
 * if_poll_thread and self-terminates.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
    struct dlil_threading_info *inp;
    struct ifnet *ifp = v;
    struct ifnet_stat_increment_param s;
    struct timespec start_time;

    VERIFY(ifp->if_eflags & IFEF_RXPOLL);

    bzero(&s, sizeof(s));
    net_timerclear(&start_time);

    lck_mtx_lock_spin(&ifp->if_poll_lock);
    if (__improbable(wres == THREAD_INTERRUPTED ||
        (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
        goto terminate;
    }

    inp = ifp->if_inp;
    VERIFY(inp != NULL);

    /* First wakeup after creation: just leave the embryonic state. */
    if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
        ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
        lck_mtx_unlock(&ifp->if_poll_lock);
        ifnet_decr_pending_thread_count(ifp);
        lck_mtx_lock_spin(&ifp->if_poll_lock);
        goto skip;
    }

    ifp->if_poll_flags |= IF_POLLF_RUNNING;

    /*
     * Keep on servicing until no more request.
     */
    for (;;) {
        struct mbuf *m_head, *m_tail;
        u_int32_t m_lim, m_cnt, m_totlen;
        u_int16_t req = ifp->if_poll_req;

        /* Per-poll packet budget: explicit limit or a derived default. */
        m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
            MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
        lck_mtx_unlock(&ifp->if_poll_lock);

        /*
         * If no longer attached, there's nothing to do;
         * else hold an IO refcnt to prevent the interface
         * from being detached (will be released below.)
         */
        if (!ifnet_is_attached(ifp, 1)) {
            lck_mtx_lock_spin(&ifp->if_poll_lock);
            break;
        }

        if (dlil_verbose > 1) {
            DLIL_PRINTF("%s: polling up to %d pkts, "
                "pkts avg %d max %d, wreq avg %d, "
                "bytes avg %d\n",
                if_name(ifp), m_lim,
                ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
                ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
        }

        /* invoke the driver's input poll routine */
        ((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
            &m_cnt, &m_totlen));

        if (m_head != NULL) {
            VERIFY(m_tail != NULL && m_cnt > 0);

            if (dlil_verbose > 1) {
                DLIL_PRINTF("%s: polled %d pkts, "
                    "pkts avg %d max %d, wreq avg %d, "
                    "bytes avg %d\n",
                    if_name(ifp), m_cnt,
                    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
                    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
            }

            /* stats are required for extended variant */
            s.packets_in = m_cnt;
            s.bytes_in = m_totlen;

            (void) ifnet_input_common(ifp, m_head, m_tail,
                &s, TRUE, TRUE);
        } else {
            if (dlil_verbose > 1) {
                DLIL_PRINTF("%s: no packets, "
                    "pkts avg %d max %d, wreq avg %d, "
                    "bytes avg %d\n",
                    if_name(ifp), ifp->if_rxpoll_pavg,
                    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
                    ifp->if_rxpoll_bavg);
            }

            /* Empty poll: still notify the input path (poll mode). */
            (void) ifnet_input_common(ifp, NULL, NULL,
                NULL, FALSE, TRUE);
        }

        /* Release the io ref count */
        ifnet_decr_iorefcnt(ifp);

        lck_mtx_lock_spin(&ifp->if_poll_lock);

        /* if there's no pending request, we're done */
        if (req == ifp->if_poll_req ||
            (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
            break;
        }
    }
skip:
    ifp->if_poll_req = 0;
    ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

    if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
        uint64_t deadline = TIMEOUT_WAIT_FOREVER;
        struct timespec *ts;

        /*
         * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
         * until ifnet_poll() is called again.
         */
        ts = &ifp->if_poll_cycle;
        if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
            ts = NULL;
        }

        if (ts != NULL) {
            clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
                (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
        }

        (void) assert_wait_deadline(&ifp->if_poll_thread,
            THREAD_UNINT, deadline);
        lck_mtx_unlock(&ifp->if_poll_lock);
        (void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
        /* NOTREACHED */
    } else {
terminate:
        /* interface is detached (maybe while asleep)? */
        ifnet_set_poll_cycle(ifp, NULL);

        /* clear if_poll_thread to allow termination to continue */
        ASSERT(ifp->if_poll_thread != THREAD_NULL);
        ifp->if_poll_thread = THREAD_NULL;
        wakeup((caddr_t)&ifp->if_poll_thread);
        lck_mtx_unlock(&ifp->if_poll_lock);

        if (dlil_verbose) {
            DLIL_PRINTF("%s: poller thread terminated\n",
                if_name(ifp));
        }

        /* for the extra refcnt from kernel_thread_start() */
        thread_deallocate(current_thread());
        /* this is the end */
        thread_terminate(current_thread());
        /* NOTREACHED */
    }

    /* must never get here */
    VERIFY(0);
    /* NOTREACHED */
    __builtin_unreachable();
}
4676
4677 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4678 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4679 {
4680 if (ts == NULL) {
4681 bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4682 } else {
4683 *(&ifp->if_poll_cycle) = *ts;
4684 }
4685
4686 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4687 DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4688 if_name(ifp), ts->tv_nsec);
4689 }
4690 }
4691
4692 void
ifnet_purge(struct ifnet * ifp)4693 ifnet_purge(struct ifnet *ifp)
4694 {
4695 if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4696 if_qflush_snd(ifp, false);
4697 }
4698 }
4699
/*
 * Propagate a classq event to the send queue.  If the token bucket
 * regulator is active, re-apply its current profile first so the TBR
 * recomputes its parameters (e.g. after a link rate change).
 * Caller must hold the ifclassq lock.
 */
void
ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
{
    IFCQ_LOCK_ASSERT_HELD(ifq);

    if (!(IFCQ_IS_READY(ifq))) {
        return;
    }

    /* Re-install the TBR profile with its existing raw rate/percent. */
    if (IFCQ_TBR_IS_ENABLED(ifq)) {
        struct tb_profile tb = {
            .rate = ifq->ifcq_tbr.tbr_rate_raw,
            .percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
        };
        (void) ifclassq_tbr_set(ifq, &tb, FALSE);
    }

    ifclassq_update(ifq, ev);
}
4719
4720 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4721 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4722 {
4723 switch (ev) {
4724 case CLASSQ_EV_LINK_BANDWIDTH:
4725 if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4726 ifp->if_poll_update++;
4727 }
4728 break;
4729
4730 default:
4731 break;
4732 }
4733 }
4734
4735 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4736 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4737 {
4738 struct ifclassq *ifq;
4739 u_int32_t omodel;
4740 errno_t err;
4741
4742 if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4743 return EINVAL;
4744 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4745 return ENXIO;
4746 }
4747
4748 ifq = ifp->if_snd;
4749 IFCQ_LOCK(ifq);
4750 omodel = ifp->if_output_sched_model;
4751 ifp->if_output_sched_model = model;
4752 if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4753 ifp->if_output_sched_model = omodel;
4754 }
4755 IFCQ_UNLOCK(ifq);
4756
4757 return err;
4758 }
4759
4760 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4761 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4762 {
4763 if (ifp == NULL) {
4764 return EINVAL;
4765 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4766 return ENXIO;
4767 }
4768
4769 ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4770
4771 return 0;
4772 }
4773
4774 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4775 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4776 {
4777 if (ifp == NULL || maxqlen == NULL) {
4778 return EINVAL;
4779 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4780 return ENXIO;
4781 }
4782
4783 *maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4784
4785 return 0;
4786 }
4787
4788 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4789 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4790 {
4791 errno_t err;
4792
4793 if (ifp == NULL || pkts == NULL) {
4794 err = EINVAL;
4795 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4796 err = ENXIO;
4797 } else {
4798 err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4799 IF_CLASSQ_ALL_GRPS, pkts, NULL);
4800 }
4801
4802 return err;
4803 }
4804
4805 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4806 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4807 u_int32_t *pkts, u_int32_t *bytes)
4808 {
4809 errno_t err;
4810
4811 if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4812 (pkts == NULL && bytes == NULL)) {
4813 err = EINVAL;
4814 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4815 err = ENXIO;
4816 } else {
4817 err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4818 pkts, bytes);
4819 }
4820
4821 return err;
4822 }
4823
4824 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4825 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4826 {
4827 struct dlil_threading_info *inp;
4828
4829 if (ifp == NULL) {
4830 return EINVAL;
4831 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4832 return ENXIO;
4833 }
4834
4835 if (maxqlen == 0) {
4836 maxqlen = if_rcvq_maxlen;
4837 } else if (maxqlen < IF_RCVQ_MINLEN) {
4838 maxqlen = IF_RCVQ_MINLEN;
4839 }
4840
4841 inp = ifp->if_inp;
4842 lck_mtx_lock(&inp->dlth_lock);
4843 qlimit(&inp->dlth_pkts) = maxqlen;
4844 lck_mtx_unlock(&inp->dlth_lock);
4845
4846 return 0;
4847 }
4848
4849 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4850 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4851 {
4852 struct dlil_threading_info *inp;
4853
4854 if (ifp == NULL || maxqlen == NULL) {
4855 return EINVAL;
4856 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4857 return ENXIO;
4858 }
4859
4860 inp = ifp->if_inp;
4861 lck_mtx_lock(&inp->dlth_lock);
4862 *maxqlen = qlimit(&inp->dlth_pkts);
4863 lck_mtx_unlock(&inp->dlth_lock);
4864 return 0;
4865 }
4866
4867 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4868 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4869 uint16_t delay_timeout)
4870 {
4871 if (delay_qlen > 0 && delay_timeout > 0) {
4872 if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4873 ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4874 ifp->if_start_delay_timeout = min(20000, delay_timeout);
4875 /* convert timeout to nanoseconds */
4876 ifp->if_start_delay_timeout *= 1000;
4877 kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4878 ifp->if_xname, (uint32_t)delay_qlen,
4879 (uint32_t)delay_timeout);
4880 } else {
4881 if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4882 }
4883 }
4884
4885 /*
4886 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
4887 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
4888 * buf holds the full header.
4889 */
4890 static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t * buf,uint8_t ip_ver)4891 ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
4892 {
4893 struct ip *ip;
4894 struct ip6_hdr *ip6;
4895 uint8_t lbuf[64] __attribute__((aligned(8)));
4896 uint8_t *p = buf;
4897
4898 if (ip_ver == IPVERSION) {
4899 uint8_t old_tos;
4900 uint32_t sum;
4901
4902 if (__improbable(!IP_HDR_ALIGNED_P(p))) {
4903 DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
4904 bcopy(buf, lbuf, sizeof(struct ip));
4905 p = lbuf;
4906 }
4907 ip = (struct ip *)(void *)p;
4908 if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
4909 return;
4910 }
4911
4912 DTRACE_IP1(clear__v4, struct ip *, ip);
4913 old_tos = ip->ip_tos;
4914 ip->ip_tos &= IPTOS_ECN_MASK;
4915 sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
4916 sum = (sum >> 16) + (sum & 0xffff);
4917 ip->ip_sum = (uint16_t)(sum & 0xffff);
4918
4919 if (__improbable(p == lbuf)) {
4920 bcopy(lbuf, buf, sizeof(struct ip));
4921 }
4922 } else {
4923 uint32_t flow;
4924 ASSERT(ip_ver == IPV6_VERSION);
4925
4926 if (__improbable(!IP_HDR_ALIGNED_P(p))) {
4927 DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
4928 bcopy(buf, lbuf, sizeof(struct ip6_hdr));
4929 p = lbuf;
4930 }
4931 ip6 = (struct ip6_hdr *)(void *)p;
4932 flow = ntohl(ip6->ip6_flow);
4933 if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
4934 return;
4935 }
4936
4937 DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
4938 ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);
4939
4940 if (__improbable(p == lbuf)) {
4941 bcopy(lbuf, buf, sizeof(struct ip6_hdr));
4942 }
4943 }
4944 }
4945
/*
 * Core single-packet enqueue path onto an interface's classq (ifcq, or
 * ifp->if_snd when ifcq is NULL).  Handles both mbuf and native Skywalk
 * packets: stamps a timestamp if the packet lacks one, records
 * foreground/realtime activity, applies the Wi-Fi multicast DSCP
 * workaround, runs the delayed-start coalescing heuristics, and kicks
 * the driver's start routine as needed.  The caller's packet is always
 * consumed; *pdrop reports whether it was dropped.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here. Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* make sure the Ethernet header is contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP; exit the switch, skip DSCP clearing */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface. If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* too short for an Ethernet header: skip workaround */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP; exit the switch, skip DSCP clearing */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* non-NULL only when the Wi-Fi multicast workaround applies */
	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the current window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle >= 200ms: reset and disable delaying */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: evaluate and start a new one */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open the initial window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5256
5257 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5258 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5259 classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5260 boolean_t flush, boolean_t *pdrop)
5261 {
5262 int error;
5263
5264 /* enqueue the packet (caller consumes object) */
5265 error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5266 cnt, bytes, pdrop);
5267
5268 /*
5269 * Tell the driver to start dequeueing; do this even when the queue
5270 * for the packet is suspended (EQSUSPENDED), as the driver could still
5271 * be dequeueing from other unsuspended queues.
5272 */
5273 if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5274 ifnet_start(ifp);
5275 }
5276 return error;
5277 }
5278
#if DEVELOPMENT || DEBUG
/*
 * Debug-only tracing helper: when kdebug is enabled and the global
 * flow_key_trace filter matches this packet's UDP 5-tuple, emit the
 * first 32 bytes of the UDP payload via KDBG.  Classifies the packet
 * first if it has not been flow-classified yet.
 */
void
trace_pkt_dump_payload(struct ifnet *ifp, struct __kern_packet *kpkt, bool input)
{
#define MIN_TRACE_DUMP_PKT_SIZE 32
	struct ether_header *eh = NULL;
	struct udphdr *uh = NULL;

	/* fast exit unless tracing is on and a filter is configured */
	if (__probable(kdebug_enable == 0 || (flow_key_trace.fk_ipver != IPVERSION &&
	    flow_key_trace.fk_ipver != IPV6_VERSION))) {
		return;
	}

	uint16_t bdlim, bdlen, bdoff;
	uint8_t *baddr;

	MD_BUFLET_ADDR_ABS_DLEN(kpkt, baddr, bdlen, bdlim, bdoff);

	if (!(kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED)) {
		if (!IFNET_IS_ETHERNET(ifp)) {
			return;
		}

		sa_family_t af = AF_UNSPEC;
		ASSERT(kpkt->pkt_l2_len > 0);

		baddr += kpkt->pkt_headroom;
		eh = (struct ether_header *)(void *)baddr;
		if (__improbable(sizeof(*eh) > kpkt->pkt_length)) {
			return;
		}
		if (__improbable(kpkt->pkt_headroom + sizeof(*eh) > bdlim)) {
			return;
		}
		uint16_t ether_type = ntohs(eh->ether_type);
		if (ether_type == ETHERTYPE_IP) {
			af = AF_INET;
		} else if (ether_type == ETHERTYPE_IPV6) {
			af = AF_INET6;
		} else {
			return;
		}
		flow_pkt_classify(kpkt, ifp, af, input);
	}

	if (kpkt->pkt_flow_ip_ver != flow_key_trace.fk_ipver) {
		return;
	}

	if (kpkt->pkt_flow_ip_proto != IPPROTO_UDP) {
		return;
	}

	/* on input the filter's src/dst are swapped relative to the packet */
	uint16_t sport = input ? flow_key_trace.fk_dport : flow_key_trace.fk_sport;
	uint16_t dport = input ? flow_key_trace.fk_sport : flow_key_trace.fk_dport;

	if (kpkt->pkt_flow_udp_src != sport ||
	    kpkt->pkt_flow_udp_dst != dport) {
		return;
	}

	if (kpkt->pkt_flow_ip_ver == IPVERSION) {
		struct ip *ip_header = (struct ip *)kpkt->pkt_flow_ip_hdr;
		struct in_addr *saddr = input ? &flow_key_trace.fk_dst4 : &flow_key_trace.fk_src4;
		struct in_addr *daddr = input ? &flow_key_trace.fk_src4 : &flow_key_trace.fk_dst4;

		if (ip_header->ip_src.s_addr != saddr->s_addr ||
		    ip_header->ip_dst.s_addr != daddr->s_addr) {
			return;
		}
	} else if (kpkt->pkt_flow_ip_ver == IPV6_VERSION) {
		struct ip6_hdr *ip6_header = (struct ip6_hdr *)kpkt->pkt_flow_ip_hdr;
		struct in6_addr *saddr = input ? &flow_key_trace.fk_dst6 : &flow_key_trace.fk_src6;
		struct in6_addr *daddr = input ? &flow_key_trace.fk_src6 : &flow_key_trace.fk_dst6;

		if (!IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_src, saddr) ||
		    !IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_dst, daddr)) {
			return;
		}
	}

	int udp_payload_offset = kpkt->pkt_l2_len + kpkt->pkt_flow_ip_hlen + sizeof(struct udphdr);

	uint16_t pkt_payload_len = bdlim - bdoff;
	pkt_payload_len = (uint16_t)MIN(pkt_payload_len, kpkt->pkt_length);

	/*
	 * Guard against unsigned underflow: if the buflet does not even
	 * cover the L2/IP/UDP headers, the subtraction below would wrap
	 * and the minimum-size check could falsely pass, reading past
	 * the payload.
	 */
	if (udp_payload_offset < 0 || (int)pkt_payload_len < udp_payload_offset) {
		return;
	}
	pkt_payload_len -= (uint16_t)udp_payload_offset;

	if (pkt_payload_len >= MIN_TRACE_DUMP_PKT_SIZE) {
		uh = (struct udphdr *)kpkt->pkt_flow_udp_hdr;
		uint8_t *payload = (uint8_t *)(uh + 1);

		/* Trace 32 bytes of UDP transport payload */
		uint64_t *trace1 = __DECONST(uint64_t *, payload);
		uint64_t *trace2 = trace1 + 1;
		uint64_t *trace3 = trace2 + 1;
		uint64_t *trace4 = trace3 + 1;

		if (input) {
			KDBG(IFNET_KTRACE_RX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
		} else {
			KDBG(IFNET_KTRACE_TX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
		}
	}
}
#endif /* DEVELOPMENT || DEBUG */
5384
5385 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5386 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5387 {
5388 struct ifnet *ifp = handle;
5389 boolean_t pdrop; /* dummy */
5390 uint32_t i;
5391
5392 ASSERT(n_pkts >= 1);
5393 for (i = 0; i < n_pkts - 1; i++) {
5394 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5395 FALSE, &pdrop);
5396 }
5397 /* flush with the last packet */
5398 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5399 TRUE, &pdrop);
5400
5401 return 0;
5402 }
5403
5404 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5405 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5406 classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5407 {
5408 #if DEVELOPMENT || DEBUG
5409 switch (pkt->cp_ptype) {
5410 case QP_PACKET: {
5411 trace_pkt_dump_payload(ifp, pkt->cp_kpkt, false);
5412 break;
5413 }
5414 case QP_MBUF:
5415 case QP_INVALID: {
5416 break;
5417 }
5418 }
5419 #endif /* DEVELOPMENT || DEBUG */
5420
5421 if (ifp->if_output_netem != NULL) {
5422 bool drop;
5423 errno_t error;
5424 error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5425 *pdrop = drop ? TRUE : FALSE;
5426 return error;
5427 } else {
5428 return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5429 }
5430 }
5431
5432 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5433 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5434 {
5435 boolean_t pdrop;
5436 return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5437 }
5438
5439 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5440 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5441 boolean_t *pdrop)
5442 {
5443 classq_pkt_t pkt;
5444
5445 if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5446 m->m_nextpkt != NULL) {
5447 if (m != NULL) {
5448 m_freem_list(m);
5449 *pdrop = TRUE;
5450 }
5451 return EINVAL;
5452 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5453 !IF_FULLY_ATTACHED(ifp)) {
5454 /* flag tested without lock for performance */
5455 m_freem(m);
5456 *pdrop = TRUE;
5457 return ENXIO;
5458 } else if (!(ifp->if_flags & IFF_UP)) {
5459 m_freem(m);
5460 *pdrop = TRUE;
5461 return ENETDOWN;
5462 }
5463
5464 CLASSQ_PKT_INIT_MBUF(&pkt, m);
5465 return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5466 }
5467
5468 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5469 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5470 struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5471 boolean_t *pdrop)
5472 {
5473 classq_pkt_t head, tail;
5474
5475 ASSERT(m_head != NULL);
5476 ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5477 ASSERT(m_tail != NULL);
5478 ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5479 ASSERT(ifp != NULL);
5480 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5481
5482 if (!IF_FULLY_ATTACHED(ifp)) {
5483 /* flag tested without lock for performance */
5484 m_freem_list(m_head);
5485 *pdrop = TRUE;
5486 return ENXIO;
5487 } else if (!(ifp->if_flags & IFF_UP)) {
5488 m_freem_list(m_head);
5489 *pdrop = TRUE;
5490 return ENETDOWN;
5491 }
5492
5493 CLASSQ_PKT_INIT_MBUF(&head, m_head);
5494 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5495 return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5496 flush, pdrop);
5497 }
5498
5499 #if SKYWALK
5500 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5501 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5502 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5503 {
5504 classq_pkt_t pkt;
5505
5506 ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5507
5508 if (__improbable(ifp == NULL || kpkt == NULL)) {
5509 if (kpkt != NULL) {
5510 pp_free_packet(__DECONST(struct kern_pbufpool *,
5511 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5512 *pdrop = TRUE;
5513 }
5514 return EINVAL;
5515 } else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5516 !IF_FULLY_ATTACHED(ifp))) {
5517 /* flag tested without lock for performance */
5518 pp_free_packet(__DECONST(struct kern_pbufpool *,
5519 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5520 *pdrop = TRUE;
5521 return ENXIO;
5522 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5523 pp_free_packet(__DECONST(struct kern_pbufpool *,
5524 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5525 *pdrop = TRUE;
5526 return ENETDOWN;
5527 }
5528
5529 CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5530 return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5531 }
5532
5533 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5534 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5535 boolean_t flush, boolean_t *pdrop)
5536 {
5537 return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5538 }
5539
5540 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5541 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5542 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5543 {
5544 return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5545 }
5546
5547 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5548 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5549 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5550 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5551 {
5552 classq_pkt_t head, tail;
5553
5554 ASSERT(k_head != NULL);
5555 ASSERT(k_tail != NULL);
5556 ASSERT(ifp != NULL);
5557 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5558
5559 if (!IF_FULLY_ATTACHED(ifp)) {
5560 /* flag tested without lock for performance */
5561 pp_free_packet_chain(k_head, NULL);
5562 *pdrop = TRUE;
5563 return ENXIO;
5564 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5565 pp_free_packet_chain(k_head, NULL);
5566 *pdrop = TRUE;
5567 return ENETDOWN;
5568 }
5569
5570 CLASSQ_PKT_INIT_PACKET(&head, k_head);
5571 CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5572 return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5573 flush, pdrop);
5574 }
5575
5576 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5577 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5578 struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5579 boolean_t *pdrop)
5580 {
5581 return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5582 cnt, bytes, flush, pdrop);
5583 }
5584
5585 errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5586 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5587 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5588 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5589 {
5590 return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5591 cnt, bytes, flush, pdrop);
5592 }
5593 #endif /* SKYWALK */
5594
5595 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5596 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5597 {
5598 errno_t rc;
5599 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5600
5601 if (ifp == NULL || mp == NULL) {
5602 return EINVAL;
5603 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5604 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5605 return ENXIO;
5606 }
5607 if (!ifnet_is_attached(ifp, 1)) {
5608 return ENXIO;
5609 }
5610
5611 #if SKYWALK
5612 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5613 #endif /* SKYWALK */
5614 rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5615 &pkt, NULL, NULL, NULL, 0);
5616 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5617 ifnet_decr_iorefcnt(ifp);
5618 *mp = pkt.cp_mbuf;
5619 return rc;
5620 }
5621
5622 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5623 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5624 struct mbuf **mp)
5625 {
5626 errno_t rc;
5627 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5628
5629 if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5630 return EINVAL;
5631 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5632 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5633 return ENXIO;
5634 }
5635 if (!ifnet_is_attached(ifp, 1)) {
5636 return ENXIO;
5637 }
5638
5639 #if SKYWALK
5640 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5641 #endif /* SKYWALK */
5642 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5643 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5644 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5645 ifnet_decr_iorefcnt(ifp);
5646 *mp = pkt.cp_mbuf;
5647 return rc;
5648 }
5649
5650 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5651 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5652 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5653 {
5654 errno_t rc;
5655 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5656 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5657
5658 if (ifp == NULL || head == NULL || pkt_limit < 1) {
5659 return EINVAL;
5660 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5661 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5662 return ENXIO;
5663 }
5664 if (!ifnet_is_attached(ifp, 1)) {
5665 return ENXIO;
5666 }
5667
5668 #if SKYWALK
5669 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5670 #endif /* SKYWALK */
5671 rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5672 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5673 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5674 ifnet_decr_iorefcnt(ifp);
5675 *head = pkt_head.cp_mbuf;
5676 if (tail != NULL) {
5677 *tail = pkt_tail.cp_mbuf;
5678 }
5679 return rc;
5680 }
5681
5682 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5683 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5684 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5685 {
5686 errno_t rc;
5687 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5688 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5689
5690 if (ifp == NULL || head == NULL || byte_limit < 1) {
5691 return EINVAL;
5692 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5693 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5694 return ENXIO;
5695 }
5696 if (!ifnet_is_attached(ifp, 1)) {
5697 return ENXIO;
5698 }
5699
5700 #if SKYWALK
5701 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5702 #endif /* SKYWALK */
5703 rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5704 byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5705 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5706 ifnet_decr_iorefcnt(ifp);
5707 *head = pkt_head.cp_mbuf;
5708 if (tail != NULL) {
5709 *tail = pkt_tail.cp_mbuf;
5710 }
5711 return rc;
5712 }
5713
5714 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5715 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5716 u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5717 u_int32_t *len)
5718 {
5719 errno_t rc;
5720 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5721 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5722
5723 if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5724 !MBUF_VALID_SC(sc)) {
5725 return EINVAL;
5726 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5727 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5728 return ENXIO;
5729 }
5730 if (!ifnet_is_attached(ifp, 1)) {
5731 return ENXIO;
5732 }
5733
5734 #if SKYWALK
5735 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5736 #endif /* SKYWALK */
5737 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
5738 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
5739 cnt, len, 0);
5740 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5741 ifnet_decr_iorefcnt(ifp);
5742 *head = pkt_head.cp_mbuf;
5743 if (tail != NULL) {
5744 *tail = pkt_tail.cp_mbuf;
5745 }
5746 return rc;
5747 }
5748
5749 #if XNU_TARGET_OS_OSX
5750 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)5751 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
5752 const struct sockaddr *dest, const char *dest_linkaddr,
5753 const char *frame_type, u_int32_t *pre, u_int32_t *post)
5754 {
5755 if (pre != NULL) {
5756 *pre = 0;
5757 }
5758 if (post != NULL) {
5759 *post = 0;
5760 }
5761
5762 return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
5763 }
5764 #endif /* XNU_TARGET_OS_OSX */
5765
5766 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5767 packet_has_vlan_tag(struct mbuf * m)
5768 {
5769 u_int tag = 0;
5770
5771 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5772 tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5773 if (tag == 0) {
5774 /* the packet is just priority-tagged, clear the bit */
5775 m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5776 }
5777 }
5778 return tag != 0;
5779 }
5780
/*
 * Run an inbound packet through the interface filter chain.
 *
 * The filter lock is dropped around each filter callback; the monitor
 * busy state keeps the filter list from being modified while unlocked.
 * A filter may consume or replace the packet via m_p/frame_header_p.
 * Returns 0 on success, or the first non-zero filter result (the
 * packet is then owned by the filter, e.g. EJUSTRETURN).
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet = FALSE;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	/* note: may clear CSUM_VLAN_TAG_VALID for priority-tagged packets */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5841
5842 __attribute__((noinline))
5843 static int
dlil_interface_filters_output(struct ifnet * ifp,struct mbuf ** m_p,protocol_family_t protocol_family)5844 dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
5845 protocol_family_t protocol_family)
5846 {
5847 boolean_t is_vlan_packet;
5848 struct ifnet_filter *filter;
5849 struct mbuf *m = *m_p;
5850
5851 is_vlan_packet = packet_has_vlan_tag(m);
5852
5853 /*
5854 * Pass the outbound packet to the interface filters
5855 */
5856 lck_mtx_lock_spin(&ifp->if_flt_lock);
5857 /* prevent filter list from changing in case we drop the lock */
5858 if_flt_monitor_busy(ifp);
5859 TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
5860 int result;
5861
5862 /* exclude VLAN packets from external filters PR-3586856 */
5863 if (is_vlan_packet &&
5864 (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
5865 continue;
5866 }
5867
5868 if (!filter->filt_skip && filter->filt_output != NULL &&
5869 (filter->filt_protocol == 0 ||
5870 filter->filt_protocol == protocol_family)) {
5871 lck_mtx_unlock(&ifp->if_flt_lock);
5872
5873 result = filter->filt_output(filter->filt_cookie, ifp,
5874 protocol_family, m_p);
5875
5876 lck_mtx_lock_spin(&ifp->if_flt_lock);
5877 if (result != 0) {
5878 /* we're done with the filter list */
5879 if_flt_monitor_unbusy(ifp);
5880 lck_mtx_unlock(&ifp->if_flt_lock);
5881 return result;
5882 }
5883 }
5884 }
5885 /* we're done with the filter list */
5886 if_flt_monitor_unbusy(ifp);
5887 lck_mtx_unlock(&ifp->if_flt_lock);
5888
5889 return 0;
5890 }
5891
5892 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)5893 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
5894 {
5895 int error;
5896
5897 if (ifproto->proto_kpi == kProtoKPI_v1) {
5898 /* Version 1 protocols get one packet at a time */
5899 while (m != NULL) {
5900 char * frame_header;
5901 mbuf_t next_packet;
5902
5903 next_packet = m->m_nextpkt;
5904 m->m_nextpkt = NULL;
5905 frame_header = m->m_pkthdr.pkt_hdr;
5906 m->m_pkthdr.pkt_hdr = NULL;
5907 error = (*ifproto->kpi.v1.input)(ifproto->ifp,
5908 ifproto->protocol_family, m, frame_header);
5909 if (error != 0 && error != EJUSTRETURN) {
5910 m_freem(m);
5911 }
5912 m = next_packet;
5913 }
5914 } else if (ifproto->proto_kpi == kProtoKPI_v2) {
5915 /* Version 2 protocols support packet lists */
5916 error = (*ifproto->kpi.v2.input)(ifproto->ifp,
5917 ifproto->protocol_family, m);
5918 if (error != 0 && error != EJUSTRETURN) {
5919 m_freem_list(m);
5920 }
5921 }
5922 }
5923
5924 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5925 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5926 struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5927 {
5928 struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5929
5930 if (s->packets_in != 0) {
5931 d->packets_in += s->packets_in;
5932 }
5933 if (s->bytes_in != 0) {
5934 d->bytes_in += s->bytes_in;
5935 }
5936 if (s->errors_in != 0) {
5937 d->errors_in += s->errors_in;
5938 }
5939
5940 if (s->packets_out != 0) {
5941 d->packets_out += s->packets_out;
5942 }
5943 if (s->bytes_out != 0) {
5944 d->bytes_out += s->bytes_out;
5945 }
5946 if (s->errors_out != 0) {
5947 d->errors_out += s->errors_out;
5948 }
5949
5950 if (s->collisions != 0) {
5951 d->collisions += s->collisions;
5952 }
5953 if (s->dropped != 0) {
5954 d->dropped += s->dropped;
5955 }
5956
5957 if (poll) {
5958 PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5959 }
5960 }
5961
/*
 * Flush the input thread's private statistic accumulators into the
 * ifnet's global counters, zeroing each accumulator as it is drained.
 * Returns TRUE when the interface has a data threshold configured,
 * so the caller knows to check it.
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
6021
6022 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6023 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6024 {
6025 return dlil_input_packet_list_common(ifp, m, 0,
6026 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6027 }
6028
6029 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6030 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6031 u_int32_t cnt, ifnet_model_t mode)
6032 {
6033 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6034 }
6035
/*
 * Core inbound packet loop: walk the packet chain, demux each packet
 * to a protocol family, apply CLAT46 translation and interface
 * filters, then batch consecutive packets for the same protocol and
 * hand each batch to the protocol via dlil_ifproto_input().
 *
 * ifp_param may be NULL, in which case each packet's rcvif is used
 * (packets in one chain may then span multiple interfaces).  An IO
 * reference (datamov) is held on non-loopback interfaces while their
 * packets are processed.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;
	mbuf_t *pkt_next = NULL;
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * When polling is on and a packet count was supplied, poke the
	 * poller every poll_ival packets while draining this chain.
	 */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			/* EJUSTRETURN: the demux consumed the packet */
			if (error == EJUSTRETURN) {
				goto next;
			}
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceding the data pointer.  Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			/* IP protocols are matched later in their input paths */
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				/* checksum start looks bogus; invalidate it */
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			atomic_add_64(&ifp->if_imcasts, 1);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* same proto as the previous packet; reuse it */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6368
6369 errno_t
if_mcasts_update(struct ifnet * ifp)6370 if_mcasts_update(struct ifnet *ifp)
6371 {
6372 errno_t err;
6373
6374 err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6375 if (err == EAFNOSUPPORT) {
6376 err = 0;
6377 }
6378 DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6379 "(err=%d)\n", if_name(ifp),
6380 (err == 0 ? "successfully restored" : "failed to restore"),
6381 ifp->if_updatemcasts, err);
6382
6383 /* just return success */
6384 return 0;
6385 }
6386
6387 /* If ifp is set, we will increment the generation for the interface */
6388 int
dlil_post_complete_msg(struct ifnet * ifp,struct kev_msg * event)6389 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6390 {
6391 if (ifp != NULL) {
6392 ifnet_increment_generation(ifp);
6393 }
6394
6395 #if NECP
6396 necp_update_all_clients();
6397 #endif /* NECP */
6398
6399 return kev_post_msg(event);
6400 }
6401
6402 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6403 dlil_post_sifflags_msg(struct ifnet * ifp)
6404 {
6405 struct kev_msg ev_msg;
6406 struct net_event_data ev_data;
6407
6408 bzero(&ev_data, sizeof(ev_data));
6409 bzero(&ev_msg, sizeof(ev_msg));
6410 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6411 ev_msg.kev_class = KEV_NETWORK_CLASS;
6412 ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6413 ev_msg.event_code = KEV_DL_SIFFLAGS;
6414 strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6415 ev_data.if_family = ifp->if_family;
6416 ev_data.if_unit = (u_int32_t) ifp->if_unit;
6417 ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6418 ev_msg.dv[0].data_ptr = &ev_data;
6419 ev_msg.dv[1].data_length = 0;
6420 dlil_post_complete_msg(ifp, &ev_msg);
6421 }
6422
6423 #define TMP_IF_PROTO_ARR_SIZE 10
/*
 * Deliver a kernel event to the interface filters, every attached
 * protocol, the interface itself, and finally post it system-wide.
 *
 * Protocol references are collected into a private array (stack array
 * for small counts, heap otherwise) so the ifnet lock need not be
 * held across the protocol event callbacks.  When update_generation
 * is true, the interface generation is bumped as part of posting.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock across the callback */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* take a reference on each attached protocol */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* invoke each protocol's event callback without the ifnet lock */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6523
6524 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6525 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6526 {
6527 struct kev_msg kev_msg;
6528 int result = 0;
6529
6530 if (ifp == NULL || event == NULL) {
6531 return EINVAL;
6532 }
6533
6534 bzero(&kev_msg, sizeof(kev_msg));
6535 kev_msg.vendor_code = event->vendor_code;
6536 kev_msg.kev_class = event->kev_class;
6537 kev_msg.kev_subclass = event->kev_subclass;
6538 kev_msg.event_code = event->event_code;
6539 kev_msg.dv[0].data_ptr = &event->event_data[0];
6540 kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6541 kev_msg.dv[1].data_length = 0;
6542
6543 result = dlil_event_internal(ifp, &kev_msg, TRUE);
6544
6545 return result;
6546 }
6547
6548 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6549 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6550 {
6551 mbuf_t n = m;
6552 int chainlen = 0;
6553
6554 while (n != NULL) {
6555 chainlen++;
6556 n = n->m_next;
6557 }
6558 switch (chainlen) {
6559 case 0:
6560 break;
6561 case 1:
6562 atomic_add_64(&cls->cls_one, 1);
6563 break;
6564 case 2:
6565 atomic_add_64(&cls->cls_two, 1);
6566 break;
6567 case 3:
6568 atomic_add_64(&cls->cls_three, 1);
6569 break;
6570 case 4:
6571 atomic_add_64(&cls->cls_four, 1);
6572 break;
6573 case 5:
6574 default:
6575 atomic_add_64(&cls->cls_five_or_more, 1);
6576 break;
6577 }
6578 }
6579
6580 #if CONFIG_DTRACE
/*
 * Fire the DTrace ip:::send probe for an outbound IPv4 or IPv6
 * packet; packets of other protocol families are ignored.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
6597 #endif /* CONFIG_DTRACE */
6598
6599 /*
6600 * dlil_output
6601 *
6602 * Caller should have a lock on the protocol domain if the protocol
6603 * doesn't support finer grained locking. In most cases, the lock
6604 * will be held from the socket layer and won't be released until
6605 * we return back to the socket layer.
6606 *
6607 * This does mean that we must take a protocol lock before we take
6608 * an interface lock if we're going to take both. This makes sense
6609 * because a protocol is likely to interact with an ifp while it
6610 * is under the protocol lock.
6611 *
6612 * An advisory code will be returned if adv is not null. This
6613 * can be used to provide feedback about interface queues to the
6614 * application.
6615 */
6616 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int raw,struct flowadv * adv)6617 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6618 void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
6619 {
6620 char *frame_type = NULL;
6621 char *dst_linkaddr = NULL;
6622 int retval = 0;
6623 char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
6624 char dst_linkaddr_buffer[MAX_LINKADDR * 4];
6625 struct if_proto *proto = NULL;
6626 mbuf_t m = NULL;
6627 mbuf_t send_head = NULL;
6628 mbuf_t *send_tail = &send_head;
6629 int iorefcnt = 0;
6630 u_int32_t pre = 0, post = 0;
6631 u_int32_t fpkts = 0, fbytes = 0;
6632 int32_t flen = 0;
6633 struct timespec now;
6634 u_int64_t now_nsec;
6635 boolean_t did_clat46 = FALSE;
6636 protocol_family_t old_proto_family = proto_family;
6637 struct sockaddr_in6 dest6;
6638 struct rtentry *rt = NULL;
6639 u_int32_t m_loop_set = 0;
6640
6641 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6642
6643 /*
6644 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6645 * from happening while this operation is in progress
6646 */
6647 if (!ifnet_datamov_begin(ifp)) {
6648 retval = ENXIO;
6649 goto cleanup;
6650 }
6651 iorefcnt = 1;
6652
6653 VERIFY(ifp->if_output_dlil != NULL);
6654
6655 /* update the driver's multicast filter, if needed */
6656 if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6657 ifp->if_updatemcasts = 0;
6658 }
6659
6660 frame_type = frame_type_buffer;
6661 dst_linkaddr = dst_linkaddr_buffer;
6662
6663 if (raw == 0) {
6664 ifnet_lock_shared(ifp);
6665 /* callee holds a proto refcnt upon success */
6666 proto = find_attached_proto(ifp, proto_family);
6667 if (proto == NULL) {
6668 ifnet_lock_done(ifp);
6669 retval = ENXIO;
6670 goto cleanup;
6671 }
6672 ifnet_lock_done(ifp);
6673 }
6674
6675 preout_again:
6676 if (packetlist == NULL) {
6677 goto cleanup;
6678 }
6679
6680 m = packetlist;
6681 packetlist = packetlist->m_nextpkt;
6682 m->m_nextpkt = NULL;
6683
6684 m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6685
6686 /*
6687 * Perform address family translation for the first
6688 * packet outside the loop in order to perform address
6689 * lookup for the translated proto family.
6690 */
6691 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6692 (ifp->if_type == IFT_CELLULAR ||
6693 dlil_is_clat_needed(proto_family, m))) {
6694 retval = dlil_clat46(ifp, &proto_family, &m);
6695 /*
6696 * Go to the next packet if translation fails
6697 */
6698 if (retval != 0) {
6699 m_freem(m);
6700 m = NULL;
6701 ip6stat.ip6s_clat464_out_drop++;
6702 /* Make sure that the proto family is PF_INET */
6703 ASSERT(proto_family == PF_INET);
6704 goto preout_again;
6705 }
6706 /*
6707 * Free the old one and make it point to the IPv6 proto structure.
6708 *
6709 * Change proto for the first time we have successfully
6710 * performed address family translation.
6711 */
6712 if (!did_clat46 && proto_family == PF_INET6) {
6713 did_clat46 = TRUE;
6714
6715 if (proto != NULL) {
6716 if_proto_free(proto);
6717 }
6718 ifnet_lock_shared(ifp);
6719 /* callee holds a proto refcnt upon success */
6720 proto = find_attached_proto(ifp, proto_family);
6721 if (proto == NULL) {
6722 ifnet_lock_done(ifp);
6723 retval = ENXIO;
6724 m_freem(m);
6725 m = NULL;
6726 goto cleanup;
6727 }
6728 ifnet_lock_done(ifp);
6729 if (ifp->if_type == IFT_ETHER) {
6730 /* Update the dest to translated v6 address */
6731 dest6.sin6_len = sizeof(struct sockaddr_in6);
6732 dest6.sin6_family = AF_INET6;
6733 dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
6734 dest = (const struct sockaddr *)&dest6;
6735
6736 /*
6737 * Lookup route to the translated destination
6738 * Free this route ref during cleanup
6739 */
6740 rt = rtalloc1_scoped((struct sockaddr *)&dest6,
6741 0, 0, ifp->if_index);
6742
6743 route = rt;
6744 }
6745 }
6746 }
6747
6748 /*
6749 * This path gets packet chain going to the same destination.
6750 * The pre output routine is used to either trigger resolution of
6751 * the next hop or retreive the next hop's link layer addressing.
6752 * For ex: ether_inet(6)_pre_output routine.
6753 *
6754 * If the routine returns EJUSTRETURN, it implies that packet has
6755 * been queued, and therefore we have to call preout_again for the
6756 * following packet in the chain.
6757 *
6758 * For errors other than EJUSTRETURN, the current packet is freed
6759 * and the rest of the chain (pointed by packetlist is freed as
6760 * part of clean up.
6761 *
6762 * Else if there is no error the retrieved information is used for
6763 * all the packets in the chain.
6764 */
6765 if (raw == 0) {
6766 proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
6767 proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
6768 retval = 0;
6769 if (preoutp != NULL) {
6770 retval = preoutp(ifp, proto_family, &m, dest, route,
6771 frame_type, dst_linkaddr);
6772
6773 if (retval != 0) {
6774 if (retval == EJUSTRETURN) {
6775 goto preout_again;
6776 }
6777 m_freem(m);
6778 m = NULL;
6779 goto cleanup;
6780 }
6781 }
6782 }
6783
6784 do {
6785 /*
6786 * pkt_hdr is set here to point to m_data prior to
6787 * calling into the framer. This value of pkt_hdr is
6788 * used by the netif gso logic to retrieve the ip header
6789 * for the TCP packets, offloaded for TSO processing.
6790 */
6791 if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
6792 uint8_t vlan_encap_len = 0;
6793
6794 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
6795 vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
6796 }
6797 m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
6798 } else {
6799 m->m_pkthdr.pkt_hdr = mtod(m, void *);
6800 }
6801
6802 /*
6803 * Perform address family translation if needed.
6804 * For now we only support stateless 4 to 6 translation
6805 * on the out path.
6806 *
6807 * The routine below translates IP header, updates protocol
6808 * checksum and also translates ICMP.
6809 *
6810 * We skip the first packet as it is already translated and
6811 * the proto family is set to PF_INET6.
6812 */
6813 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6814 (ifp->if_type == IFT_CELLULAR ||
6815 dlil_is_clat_needed(proto_family, m))) {
6816 retval = dlil_clat46(ifp, &proto_family, &m);
6817 /* Goto the next packet if the translation fails */
6818 if (retval != 0) {
6819 m_freem(m);
6820 m = NULL;
6821 ip6stat.ip6s_clat464_out_drop++;
6822 goto next;
6823 }
6824 }
6825
6826 #if CONFIG_DTRACE
6827 if (!raw) {
6828 dlil_output_dtrace(ifp, proto_family, m);
6829 }
6830 #endif /* CONFIG_DTRACE */
6831
6832 if (raw == 0 && ifp->if_framer != NULL) {
6833 int rcvif_set = 0;
6834
6835 /*
6836 * If this is a broadcast packet that needs to be
6837 * looped back into the system, set the inbound ifp
6838 * to that of the outbound ifp. This will allow
6839 * us to determine that it is a legitimate packet
6840 * for the system. Only set the ifp if it's not
6841 * already set, just to be safe.
6842 */
6843 if ((m->m_flags & (M_BCAST | M_LOOP)) &&
6844 m->m_pkthdr.rcvif == NULL) {
6845 m->m_pkthdr.rcvif = ifp;
6846 rcvif_set = 1;
6847 }
6848 m_loop_set = m->m_flags & M_LOOP;
6849 retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
6850 frame_type, &pre, &post);
6851 if (retval != 0) {
6852 if (retval != EJUSTRETURN) {
6853 m_freem(m);
6854 }
6855 goto next;
6856 }
6857
6858 /*
6859 * For partial checksum offload, adjust the start
6860 * and stuff offsets based on the prepended header.
6861 */
6862 if ((m->m_pkthdr.csum_flags &
6863 (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6864 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6865 m->m_pkthdr.csum_tx_stuff += pre;
6866 m->m_pkthdr.csum_tx_start += pre;
6867 }
6868
6869 if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
6870 dlil_output_cksum_dbg(ifp, m, pre,
6871 proto_family);
6872 }
6873
6874 /*
6875 * Clear the ifp if it was set above, and to be
6876 * safe, only if it is still the same as the
6877 * outbound ifp we have in context. If it was
6878 * looped back, then a copy of it was sent to the
6879 * loopback interface with the rcvif set, and we
6880 * are clearing the one that will go down to the
6881 * layer below.
6882 */
6883 if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
6884 m->m_pkthdr.rcvif = NULL;
6885 }
6886 }
6887
6888 /*
6889 * Let interface filters (if any) do their thing ...
6890 */
6891 retval = dlil_interface_filters_output(ifp, &m, proto_family);
6892 if (retval != 0) {
6893 if (retval != EJUSTRETURN) {
6894 m_freem(m);
6895 }
6896 goto next;
6897 }
6898 /*
6899 * Strip away M_PROTO1 bit prior to sending packet
6900 * to the driver as this field may be used by the driver
6901 */
6902 m->m_flags &= ~M_PROTO1;
6903
6904 /*
6905 * If the underlying interface is not capable of handling a
6906 * packet whose data portion spans across physically disjoint
6907 * pages, we need to "normalize" the packet so that we pass
6908 * down a chain of mbufs where each mbuf points to a span that
6909 * resides in the system page boundary. If the packet does
6910 * not cross page(s), the following is a no-op.
6911 */
6912 if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
6913 if ((m = m_normalize(m)) == NULL) {
6914 goto next;
6915 }
6916 }
6917
6918 /*
6919 * If this is a TSO packet, make sure the interface still
6920 * advertise TSO capability.
6921 */
6922 if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
6923 retval = EMSGSIZE;
6924 m_freem(m);
6925 goto cleanup;
6926 }
6927
6928 ifp_inc_traffic_class_out(ifp, m);
6929
6930 #if SKYWALK
6931 /*
6932 * For native skywalk devices, packets will be passed to pktap
6933 * after GSO or after the mbuf to packet conversion.
6934 * This is done for IPv4/IPv6 packets only because there is no
6935 * space in the mbuf to pass down the proto family.
6936 */
6937 if (dlil_is_native_netif_nexus(ifp)) {
6938 if (raw || m->m_pkthdr.pkt_proto == 0) {
6939 pktap_output(ifp, proto_family, m, pre, post);
6940 m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
6941 }
6942 } else {
6943 pktap_output(ifp, proto_family, m, pre, post);
6944 }
6945 #else /* SKYWALK */
6946 pktap_output(ifp, proto_family, m, pre, post);
6947 #endif /* SKYWALK */
6948
6949 /*
6950 * Count the number of elements in the mbuf chain
6951 */
6952 if (tx_chain_len_count) {
6953 dlil_count_chain_len(m, &tx_chain_len_stats);
6954 }
6955
6956 /*
6957 * Record timestamp; ifnet_enqueue() will use this info
6958 * rather than redoing the work. An optimization could
6959 * involve doing this just once at the top, if there are
6960 * no interface filters attached, but that's probably
6961 * not a big deal.
6962 */
6963 nanouptime(&now);
6964 net_timernsec(&now, &now_nsec);
6965 (void) mbuf_set_timestamp(m, now_nsec, TRUE);
6966
6967 /*
6968 * Discard partial sum information if this packet originated
6969 * from another interface; the packet would already have the
6970 * final checksum and we shouldn't recompute it.
6971 */
6972 if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
6973 (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6974 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6975 m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
6976 m->m_pkthdr.csum_data = 0;
6977 }
6978
6979 /*
6980 * Finally, call the driver.
6981 */
6982 if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
6983 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6984 flen += (m_pktlen(m) - (pre + post));
6985 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6986 }
6987 *send_tail = m;
6988 send_tail = &m->m_nextpkt;
6989 } else {
6990 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6991 flen = (m_pktlen(m) - (pre + post));
6992 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6993 } else {
6994 flen = 0;
6995 }
6996 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
6997 0, 0, 0, 0, 0);
6998 retval = (*ifp->if_output_dlil)(ifp, m);
6999 if (retval == EQFULL || retval == EQSUSPENDED) {
7000 if (adv != NULL && adv->code == FADV_SUCCESS) {
7001 adv->code = (retval == EQFULL ?
7002 FADV_FLOW_CONTROLLED :
7003 FADV_SUSPENDED);
7004 }
7005 retval = 0;
7006 }
7007 if (retval == 0 && flen > 0) {
7008 fbytes += flen;
7009 fpkts++;
7010 }
7011 if (retval != 0 && dlil_verbose) {
7012 DLIL_PRINTF("%s: output error on %s retval = %d\n",
7013 __func__, if_name(ifp),
7014 retval);
7015 }
7016 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
7017 0, 0, 0, 0, 0);
7018 }
7019 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7020
7021 next:
7022 m = packetlist;
7023 if (m != NULL) {
7024 m->m_flags |= m_loop_set;
7025 packetlist = packetlist->m_nextpkt;
7026 m->m_nextpkt = NULL;
7027 }
7028 /* Reset the proto family to old proto family for CLAT */
7029 if (did_clat46) {
7030 proto_family = old_proto_family;
7031 }
7032 } while (m != NULL);
7033
7034 if (send_head != NULL) {
7035 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7036 0, 0, 0, 0, 0);
7037 if (ifp->if_eflags & IFEF_SENDLIST) {
7038 retval = (*ifp->if_output_dlil)(ifp, send_head);
7039 if (retval == EQFULL || retval == EQSUSPENDED) {
7040 if (adv != NULL) {
7041 adv->code = (retval == EQFULL ?
7042 FADV_FLOW_CONTROLLED :
7043 FADV_SUSPENDED);
7044 }
7045 retval = 0;
7046 }
7047 if (retval == 0 && flen > 0) {
7048 fbytes += flen;
7049 fpkts++;
7050 }
7051 if (retval != 0 && dlil_verbose) {
7052 DLIL_PRINTF("%s: output error on %s retval = %d\n",
7053 __func__, if_name(ifp), retval);
7054 }
7055 } else {
7056 struct mbuf *send_m;
7057 int enq_cnt = 0;
7058 VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
7059 while (send_head != NULL) {
7060 send_m = send_head;
7061 send_head = send_m->m_nextpkt;
7062 send_m->m_nextpkt = NULL;
7063 retval = (*ifp->if_output_dlil)(ifp, send_m);
7064 if (retval == EQFULL || retval == EQSUSPENDED) {
7065 if (adv != NULL) {
7066 adv->code = (retval == EQFULL ?
7067 FADV_FLOW_CONTROLLED :
7068 FADV_SUSPENDED);
7069 }
7070 retval = 0;
7071 }
7072 if (retval == 0) {
7073 enq_cnt++;
7074 if (flen > 0) {
7075 fpkts++;
7076 }
7077 }
7078 if (retval != 0 && dlil_verbose) {
7079 DLIL_PRINTF("%s: output error on %s "
7080 "retval = %d\n",
7081 __func__, if_name(ifp), retval);
7082 }
7083 }
7084 if (enq_cnt > 0) {
7085 fbytes += flen;
7086 ifnet_start(ifp);
7087 }
7088 }
7089 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7090 }
7091
7092 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7093
7094 cleanup:
7095 if (fbytes > 0) {
7096 ifp->if_fbytes += fbytes;
7097 }
7098 if (fpkts > 0) {
7099 ifp->if_fpackets += fpkts;
7100 }
7101 if (proto != NULL) {
7102 if_proto_free(proto);
7103 }
7104 if (packetlist) { /* if any packets are left, clean up */
7105 mbuf_freem_list(packetlist);
7106 }
7107 if (retval == EJUSTRETURN) {
7108 retval = 0;
7109 }
7110 if (iorefcnt == 1) {
7111 ifnet_datamov_end(ifp);
7112 }
7113 if (rt != NULL) {
7114 rtfree(rt);
7115 rt = NULL;
7116 }
7117
7118 return retval;
7119 }
7120
7121 /*
7122 * This routine checks if the destination address is not a loopback, link-local,
7123 * multicast or broadcast address.
7124 */
7125 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7126 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7127 {
7128 int ret = 0;
7129 switch (proto_family) {
7130 case PF_INET: {
7131 struct ip *iph = mtod(m, struct ip *);
7132 if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7133 ret = 1;
7134 }
7135 break;
7136 }
7137 case PF_INET6: {
7138 struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7139 if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7140 CLAT64_NEEDED(&ip6h->ip6_dst)) {
7141 ret = 1;
7142 }
7143 break;
7144 }
7145 }
7146
7147 return ret;
7148 }
7149 /*
7150 * @brief This routine translates IPv4 packet to IPv6 packet,
7151 * updates protocol checksum and also translates ICMP for code
7152 * along with inner header translation.
7153 *
7154 * @param ifp Pointer to the interface
7155 * @param proto_family pointer to protocol family. It is updated if function
7156 * performs the translation successfully.
7157 * @param m Pointer to the pointer pointing to the packet. Needed because this
7158 * routine can end up changing the mbuf to a different one.
7159 *
7160 * @return 0 on success or else a negative value.
7161 */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;	/* original IPv4 source/destination */
	uint8_t proto = 0;		/* IPv4 payload protocol (ip_p) */
	struct in6_ifaddr *ia6_clat_src = NULL;	/* local CLAT46 IPv6 addr; ref held */
	struct in6_addr *src = NULL;	/* translated IPv6 source */
	struct in6_addr dst;		/* synthesized IPv6 destination */
	int error = 0;
	uint16_t off = 0;		/* IPv4 header length, in bytes */
	uint16_t tot_len = 0;		/* total IPv4 packet length (host order) */
	uint16_t ip_id_val = 0;		/* IP id, carried into the v6 frag header */
	uint16_t ip_frag_off = 0;	/* fragment offset (host order) */

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/* Wrap the mbuf in a pbuf so the nat464 routines can operate on it */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* Snapshot header fields before translation rewrites the header */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
	 * translation.  in6ifa_ifpwithflag() returns with a reference
	 * held; it is dropped in the cleanup path below.
	 */
	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}

	src = &ia6_clat_src->ia_addr.sin6_addr;

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL; /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.  Non-first fragments carry no transport
	 * header, so only the checksum is adjusted for them.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	if (ia6_clat_src != NULL) {
		IFA_REMREF(&ia6_clat_src->ia_ifa);
	}

	if (pbuf_is_valid(pbuf)) {
		/* Hand the (possibly reallocated) mbuf back to the caller */
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		error = -1;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	if (error == 0) {
		/* Tell the caller the packet is now IPv6 */
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7294
7295 /*
7296 * @brief This routine translates incoming IPv6 to IPv4 packet,
7297 * updates protocol checksum and also translates ICMPv6 outer
7298 * and inner headers
7299 *
7300 * @return 0 on success or else a negative value.
7301 */
7302 static errno_t
dlil_clat64(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7303 dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7304 {
7305 VERIFY(*proto_family == PF_INET6);
7306 VERIFY(IS_INTF_CLAT46(ifp));
7307
7308 struct ip6_hdr *ip6h = NULL;
7309 struct in6_addr osrc, odst;
7310 uint8_t proto = 0;
7311 struct in6_ifaddr *ia6_clat_dst = NULL;
7312 struct in_ifaddr *ia4_clat_dst = NULL;
7313 struct in_addr *dst = NULL;
7314 struct in_addr src;
7315 int error = 0;
7316 uint32_t off = 0;
7317 u_int64_t tot_len = 0;
7318 uint8_t tos = 0;
7319 boolean_t is_first_frag = TRUE;
7320
7321 /* Incoming mbuf does not contain valid IP6 header */
7322 if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
7323 ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
7324 (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
7325 ip6stat.ip6s_clat464_in_tooshort_drop++;
7326 return -1;
7327 }
7328
7329 ip6h = mtod(*m, struct ip6_hdr *);
7330 /* Validate that mbuf contains IP payload equal to ip6_plen */
7331 if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
7332 ip6stat.ip6s_clat464_in_tooshort_drop++;
7333 return -1;
7334 }
7335
7336 osrc = ip6h->ip6_src;
7337 odst = ip6h->ip6_dst;
7338
7339 /*
7340 * Retrieve the local CLAT46 reserved IPv6 address.
7341 * Let the packet pass if we don't find one, as the flag
7342 * may get set before IPv6 configuration has taken place.
7343 */
7344 ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7345 if (ia6_clat_dst == NULL) {
7346 goto done;
7347 }
7348
7349 /*
7350 * Check if the original dest in the packet is same as the reserved
7351 * CLAT46 IPv6 address
7352 */
7353 if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
7354 pbuf_t pbuf_store, *pbuf = NULL;
7355 pbuf_init_mbuf(&pbuf_store, *m, ifp);
7356 pbuf = &pbuf_store;
7357
7358 /*
7359 * Retrive the local CLAT46 IPv4 address reserved for stateless
7360 * translation.
7361 */
7362 ia4_clat_dst = inifa_ifpclatv4(ifp);
7363 if (ia4_clat_dst == NULL) {
7364 IFA_REMREF(&ia6_clat_dst->ia_ifa);
7365 ip6stat.ip6s_clat464_in_nov4addr_drop++;
7366 error = -1;
7367 goto cleanup;
7368 }
7369 IFA_REMREF(&ia6_clat_dst->ia_ifa);
7370
7371 /* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
7372 dst = &ia4_clat_dst->ia_addr.sin_addr;
7373 if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
7374 ip6stat.ip6s_clat464_in_v4synthfail_drop++;
7375 error = -1;
7376 goto cleanup;
7377 }
7378
7379 ip6h = pbuf->pb_data;
7380 off = sizeof(struct ip6_hdr);
7381 proto = ip6h->ip6_nxt;
7382 tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
7383 tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);
7384
7385 /*
7386 * Translate the IP header and update the fragmentation
7387 * header if needed
7388 */
7389 error = (nat464_translate_64(pbuf, off, tos, &proto,
7390 ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
7391 0 : -1;
7392
7393 ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */
7394
7395 if (error != 0) {
7396 ip6stat.ip6s_clat464_in_64transfail_drop++;
7397 goto cleanup;
7398 }
7399
7400 /*
7401 * Translate protocol header, update checksum, checksum flags
7402 * and related fields.
7403 */
7404 error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
7405 (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
7406 NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;
7407
7408 if (error != 0) {
7409 ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
7410 goto cleanup;
7411 }
7412
7413 cleanup:
7414 if (ia4_clat_dst != NULL) {
7415 IFA_REMREF(&ia4_clat_dst->ia_ifa);
7416 }
7417
7418 if (pbuf_is_valid(pbuf)) {
7419 *m = pbuf->pb_mbuf;
7420 pbuf->pb_mbuf = NULL;
7421 pbuf_destroy(pbuf);
7422 } else {
7423 error = -1;
7424 ip6stat.ip6s_clat464_in_invalpbuf_drop++;
7425 }
7426
7427 if (error == 0) {
7428 *proto_family = PF_INET;
7429 ip6stat.ip6s_clat464_in_success++;
7430 }
7431 } /* CLAT traffic */
7432
7433 done:
7434 return error;
7435 }
7436
7437 /* The following is used to enqueue work items for ifnet ioctl events */
7438 static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);
7439
/* Deferred-ioctl arguments carried on the network work queue */
struct ifnet_ioctl_event {
	struct ifnet *ifp;	/* target interface; io ref held by enqueuer */
	u_long ioctl_code;	/* ioctl request code to issue */
};
7444
/*
 * Work-queue wrapper: embeds the generic nwk_wq_entry so the callback
 * can recover the ioctl arguments via __container_of().
 */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;			/* must be embedded; queued via nwk_wq_enqueue */
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;	/* deferred ioctl arguments */
};
7449
7450 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7451 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7452 {
7453 struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7454
7455 /*
7456 * Get an io ref count if the interface is attached.
7457 * At this point it most likely is. We are taking a reference for
7458 * deferred processing.
7459 */
7460 if (!ifnet_is_attached(ifp, 1)) {
7461 os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7462 "is not attached",
7463 __func__, __LINE__, if_name(ifp), ioctl_code);
7464 return;
7465 }
7466
7467 p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7468 Z_WAITOK | Z_ZERO | Z_NOFAIL);
7469
7470 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7471 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7472 p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7473 nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7474 }
7475
7476 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7477 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7478 {
7479 struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7480 struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7481
7482 struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7483 u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7484 int ret = 0;
7485
7486 if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7487 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7488 __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7489 } else if (dlil_verbose) {
7490 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7491 "for ioctl %lu",
7492 __func__, __LINE__, if_name(ifp), ioctl_code);
7493 }
7494 ifnet_decr_iorefcnt(ifp);
7495 kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7496 return;
7497 }
7498
/*
 * Dispatch an ioctl to, in order: the attached interface filters, the
 * protocol attached for proto_fam (if non-zero), and finally the
 * interface's own if_ioctl handler.  retval stays EOPNOTSUPP until some
 * handler produces a different result; a handler may return EJUSTRETURN
 * to claim the ioctl and stop further processing (reported to the
 * caller as success).
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;	/* merged result; EOPNOTSUPP == not handled yet */
	int result = 0;			/* result from the current handler */

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock around the upcall into the filter */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				/* normalize ENOTSUP to EOPNOTSUPP before merging */
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means handled, nothing more to do: report success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7616
7617 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7618 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7619 {
7620 errno_t error = 0;
7621
7622
7623 if (ifp->if_set_bpf_tap) {
7624 /* Get an io reference on the interface if it is attached */
7625 if (!ifnet_is_attached(ifp, 1)) {
7626 return ENXIO;
7627 }
7628 error = ifp->if_set_bpf_tap(ifp, mode, callback);
7629 ifnet_decr_iorefcnt(ifp);
7630 }
7631 return error;
7632 }
7633
7634 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7635 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7636 struct sockaddr *ll_addr, size_t ll_len)
7637 {
7638 errno_t result = EOPNOTSUPP;
7639 struct if_proto *proto;
7640 const struct sockaddr *verify;
7641 proto_media_resolve_multi resolvep;
7642
7643 if (!ifnet_is_attached(ifp, 1)) {
7644 return result;
7645 }
7646
7647 bzero(ll_addr, ll_len);
7648
7649 /* Call the protocol first; callee holds a proto refcnt upon success */
7650 ifnet_lock_shared(ifp);
7651 proto = find_attached_proto(ifp, proto_addr->sa_family);
7652 ifnet_lock_done(ifp);
7653 if (proto != NULL) {
7654 resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7655 proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7656 if (resolvep != NULL) {
7657 result = resolvep(ifp, proto_addr,
7658 (struct sockaddr_dl *)(void *)ll_addr, ll_len);
7659 }
7660 if_proto_free(proto);
7661 }
7662
7663 /* Let the interface verify the multicast address */
7664 if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7665 if (result == 0) {
7666 verify = ll_addr;
7667 } else {
7668 verify = proto_addr;
7669 }
7670 result = ifp->if_check_multi(ifp, verify);
7671 }
7672
7673 ifnet_decr_iorefcnt(ifp);
7674 return result;
7675 }
7676
7677 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7678 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7679 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7680 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7681 {
7682 struct if_proto *proto;
7683 errno_t result = 0;
7684
7685 if ((ifp->if_flags & IFF_NOARP) != 0) {
7686 result = ENOTSUP;
7687 goto done;
7688 }
7689
7690 /* callee holds a proto refcnt upon success */
7691 ifnet_lock_shared(ifp);
7692 proto = find_attached_proto(ifp, target_proto->sa_family);
7693 ifnet_lock_done(ifp);
7694 if (proto == NULL) {
7695 result = ENOTSUP;
7696 } else {
7697 proto_media_send_arp arpp;
7698 arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7699 proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7700 if (arpp == NULL) {
7701 result = ENOTSUP;
7702 } else {
7703 switch (arpop) {
7704 case ARPOP_REQUEST:
7705 arpstat.txrequests++;
7706 if (target_hw != NULL) {
7707 arpstat.txurequests++;
7708 }
7709 break;
7710 case ARPOP_REPLY:
7711 arpstat.txreplies++;
7712 break;
7713 }
7714 result = arpp(ifp, arpop, sender_hw, sender_proto,
7715 target_hw, target_proto);
7716 }
7717 if_proto_free(proto);
7718 }
7719 done:
7720 return result;
7721 }
7722
/*
 * Network thread marks are handed to callers as opaque pointers.  The
 * token returned by the push routines is &net_thread_marks_base plus the
 * mask of bits actually changed, so the pop routines can recover that
 * mask by pointer subtraction (see net_thread_marks_pop below).  A zero
 * offset — net_thread_marks_none — means "nothing to undo".
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7728
/*
 * Set the requested mark bits on the current thread.  Returns a token
 * encoding only the bits that were newly set (those not already present),
 * so the matching net_thread_marks_pop() clears exactly what this call
 * changed.  The token is the base address offset by that bit mask.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* remember only the bits not already set */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	/* encode the changed-bits mask as an offset from base */
	return (net_thread_marks_t)&base[pop];
}
7746
/*
 * Clear the requested mark bits on the current thread.  Returns a token
 * encoding only the bits that were actually cleared (those previously
 * set), so the matching net_thread_unmarks_pop() restores exactly what
 * this call changed.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* remember only the bits that were set and are being cleared */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	/* encode the changed-bits mask as an offset from base */
	return (net_thread_marks_t)&base[unpop];
}
7764
/*
 * Undo a net_thread_marks_push(): clear the bits encoded in the token.
 * The mask is recovered by subtracting the base address from the token
 * pointer; a token equal to net_thread_marks_none is a no-op.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* mask must fit in 32 bits and all its bits must currently be set */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7780
/*
 * Undo a net_thread_unmarks_push(): re-set the bits encoded in the
 * token.  The mask is recovered by pointer subtraction from the base
 * address; a token equal to net_thread_marks_none is a no-op.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* mask must fit in 32 bits and none of its bits may be set */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
7796
7797 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7798 net_thread_is_marked(u_int32_t check)
7799 {
7800 if (check != 0) {
7801 struct uthread *uth = current_uthread();
7802 return uth->uu_network_marks & check;
7803 } else {
7804 return 0;
7805 }
7806 }
7807
7808 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7809 net_thread_is_unmarked(u_int32_t check)
7810 {
7811 if (check != 0) {
7812 struct uthread *uth = current_uthread();
7813 return ~uth->uu_network_marks & check;
7814 } else {
7815 return 0;
7816 }
7817 }
7818
7819 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)7820 _is_announcement(const struct sockaddr_in * sender_sin,
7821 const struct sockaddr_in * target_sin)
7822 {
7823 if (target_sin == NULL || sender_sin == NULL) {
7824 return FALSE;
7825 }
7826
7827 return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
7828 }
7829
/*
 * Send an ARP packet on behalf of a protocol.  Normally the request is
 * handed straight to dlil_send_arp_internal() on the given interface,
 * but an ARP request whose target is an IPv4 link-local address (and
 * which is not a self-announcement) is instead fanned out across every
 * interface marked IFEF_ARPLL that has an IPv4 source address.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	/* de-const'ed copy; may be repointed at the sinarp below */
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		/*
		 * NOTE(review): only sizeof(struct sockaddr_in) bytes are
		 * copied into the sockaddr_inarp; assumes sin_other overlays
		 * the sockaddr_in padding -- confirm against the struct
		 * sockaddr_inarp layout.
		 */
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces. The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t *ifp_list;
		u_int32_t count;
		u_int32_t ifp_on;

		/* returned if no eligible interface ends up sending */
		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing. This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				/* first AF_INET address on the interface */
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* keep the lladdr ifa alive across the send */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* keep the first real status we obtained */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7944
7945 /*
7946 * Caller must hold ifnet head lock.
7947 */
7948 static int
ifnet_lookup(struct ifnet * ifp)7949 ifnet_lookup(struct ifnet *ifp)
7950 {
7951 struct ifnet *_ifp;
7952
7953 LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7954 TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7955 if (_ifp == ifp) {
7956 break;
7957 }
7958 }
7959 return _ifp != NULL;
7960 }
7961
7962 /*
7963 * Caller has to pass a non-zero refio argument to get a
7964 * IO reference count. This will prevent ifnet_detach from
7965 * being called when there are outstanding io reference counts.
7966 */
7967 int
ifnet_is_attached(struct ifnet * ifp,int refio)7968 ifnet_is_attached(struct ifnet *ifp, int refio)
7969 {
7970 int ret;
7971
7972 lck_mtx_lock_spin(&ifp->if_ref_lock);
7973 if ((ret = IF_FULLY_ATTACHED(ifp))) {
7974 if (refio > 0) {
7975 ifp->if_refio++;
7976 }
7977 }
7978 lck_mtx_unlock(&ifp->if_ref_lock);
7979
7980 return ret;
7981 }
7982
/*
 * Bump the count of interface threads that have been requested but
 * have not finished starting; paired with
 * ifnet_decr_pending_thread_count() which wakes waiters at zero.
 */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7990
/*
 * Drop one pending-thread count; when it reaches zero, wake anyone
 * sleeping on &ifp->if_threads_pending.  Underflow is a VERIFY bug.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8002
8003 /*
8004 * Caller must ensure the interface is attached; the assumption is that
8005 * there is at least an outstanding IO reference count held already.
8006 * Most callers would call ifnet_is_{attached,data_ready}() instead.
8007 */
8008 void
ifnet_incr_iorefcnt(struct ifnet * ifp)8009 ifnet_incr_iorefcnt(struct ifnet *ifp)
8010 {
8011 lck_mtx_lock_spin(&ifp->if_ref_lock);
8012 VERIFY(IF_FULLY_ATTACHED(ifp));
8013 VERIFY(ifp->if_refio > 0);
8014 ifp->if_refio++;
8015 lck_mtx_unlock(&ifp->if_ref_lock);
8016 }
8017
/*
 * Drop one IO reference; if_ref_lock must be held by the caller.
 * When the last reference goes away while IFRF_DETACHING is set,
 * the thread sleeping on &ifp->if_refio is woken up.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* datamov references are a subset of IO references */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8038
/* Locked wrapper around ifnet_decr_iorefcnt_locked(). */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8046
/*
 * Enter the data-movement path.  Takes both an IO reference and a
 * datamov reference, but only while the interface is fully attached
 * and READY (i.e. not suspended); returns FALSE with no references
 * taken otherwise.  Pair with ifnet_datamov_end().
 */
boolean_t
ifnet_datamov_begin(struct ifnet *ifp)
{
	boolean_t ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
		ifp->if_refio++;
		ifp->if_datamov++;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
8061
/*
 * Leave the data-movement path: drop the datamov reference taken by
 * ifnet_datamov_begin(), waking any drainers once the last mover is
 * gone, then release the paired IO reference.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8079
/*
 * Suspend data movement with if_ref_lock held.  Takes an IO reference
 * (released later in ifnet_datamov_resume()); the first suspender
 * clears IFRF_READY so new ifnet_datamov_begin() calls fail.
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8090
/*
 * Locked wrapper around ifnet_datamov_suspend_locked(); the interface
 * must be attached (or in the middle of detaching).
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8099
8100 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8101 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8102 {
8103 lck_mtx_lock_spin(&ifp->if_ref_lock);
8104 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8105 if (ifp->if_suspend > 0) {
8106 lck_mtx_unlock(&ifp->if_ref_lock);
8107 return FALSE;
8108 }
8109 ifnet_datamov_suspend_locked(ifp);
8110 lck_mtx_unlock(&ifp->if_ref_lock);
8111 return TRUE;
8112 }
8113
/*
 * Wait until every thread has left the data-movement path.  The
 * interface must already be suspended (IFRF_READY cleared), so no new
 * movers can enter while we sleep on if_datamov.  Once quiesced, the
 * transmit queue(s) of TXSTART interfaces are flushed.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		/* msleep drops and re-takes if_ref_lock around the sleep */
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8141
/* Convenience wrapper: suspend data movement, then wait for quiescence. */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8148
/*
 * Resume data movement after a suspend: the last resumer restores
 * IFRF_READY, and the IO reference taken at suspend time is released.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8162
/*
 * Record a refcount hold/release backtrace in the debug history of a
 * DLIF_DEBUG-enabled dlil_ifnet.  refhold selects which ring (refhold
 * vs refrele) receives the ctrace record.
 */
static void
dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
{
	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
	ctrace_t *tr;
	u_int32_t idx;
	u_int16_t *cnt;

	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
		/* NOTREACHED */
	}

	if (refhold) {
		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
		tr = dl_if_dbg->dldbg_if_refhold;
	} else {
		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
		tr = dl_if_dbg->dldbg_if_refrele;
	}

	/* atomically claim the next slot; the ring wraps at HIST_SIZE */
	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
	ctrace_record(&tr[idx]);
}
8187
8188 errno_t
dlil_if_ref(struct ifnet * ifp)8189 dlil_if_ref(struct ifnet *ifp)
8190 {
8191 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8192
8193 if (dl_if == NULL) {
8194 return EINVAL;
8195 }
8196
8197 lck_mtx_lock_spin(&dl_if->dl_if_lock);
8198 ++dl_if->dl_if_refcnt;
8199 if (dl_if->dl_if_refcnt == 0) {
8200 panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
8201 /* NOTREACHED */
8202 }
8203 if (dl_if->dl_if_trace != NULL) {
8204 (*dl_if->dl_if_trace)(dl_if, TRUE);
8205 }
8206 lck_mtx_unlock(&dl_if->dl_if_lock);
8207
8208 return 0;
8209 }
8210
/*
 * Drop a reference on the dlil_ifnet underlying ifp.  A refcnt of zero
 * on entry means an unbalanced free and panics.  Dropping the last
 * reference while the interface is still embryonic (never fully
 * attached) releases its storage via _dlil_if_release() once the lock
 * has been dropped.
 */
errno_t
dlil_if_free(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	bool need_release = FALSE;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	switch (dl_if->dl_if_refcnt) {
	case 0:
		/* decrement would underflow */
		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
		break;
	case 1:
		/* last reference on an embryonic ifnet: release it below */
		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
			need_release = TRUE;
		}
		break;
	default:
		break;
	}
	--dl_if->dl_if_refcnt;
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, FALSE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);
	if (need_release) {
		_dlil_if_release(ifp, true);
	}
	return 0;
}
8245
/*
 * Attach a fully-constructed if_proto to its interface: registers the
 * demux descriptors with the family module, links the proto into the
 * interface's protocol hash, and posts KEV_DL_PROTO_ATTACHED.  Returns
 * EEXIST if the family is already attached, ENXIO if the interface has
 * detached.  On success, *proto_count (if non-NULL) receives the
 * number of protocols now attached.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* take an IO refcnt to keep ifnet_detach at bay until we're done */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		/* drop the lookup reference before bailing out */
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* walk to the tail of the chain so insertion preserves order */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	/* the event is posted after the ifnet lock is dropped */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8325
/*
 * Post-attach housekeeping run after a protocol successfully attaches:
 * bring the interface administratively up and, on Skywalk builds,
 * plumb the flowswitch nexus when the attached protocol is IP.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8349
/*
 * Public KPI: attach a v1 protocol (kProtoKPI_v1 callbacks) to an
 * interface.  Builds the if_proto from the caller-supplied callback
 * set and delegates to dlil_attach_protocol(); on success the
 * interface is also marked up via dlil_handle_proto_attach().  The
 * ifnet head lock is held shared across the attach so the interface
 * cannot vanish from the global list meanwhile.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* mark the interface up now that a protocol is attached */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed: the if_proto was never linked, free it */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8411
/*
 * Public KPI: attach a v2 protocol (kProtoKPI_v2 callbacks) to an
 * interface.  Identical flow to ifnet_attach_protocol() except the
 * callback set is copied into the kpi.v2 variant.
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* mark the interface up now that a protocol is attached */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* attach failed: the if_proto was never linked, free it */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8473
/*
 * Public KPI: detach a protocol family from an interface.  The proto
 * is removed from the hash and its callbacks are replaced with inert
 * ifproto_media_* stubs so any thread still holding a reference fails
 * gracefully; final teardown happens when the last proto reference is
 * dropped.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* neuter the callbacks for any concurrent users of the proto */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8539
8540
8541 static errno_t
ifproto_media_input_v1(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet,char * header)8542 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8543 struct mbuf *packet, char *header)
8544 {
8545 #pragma unused(ifp, protocol, packet, header)
8546 return ENXIO;
8547 }
8548
8549 static errno_t
ifproto_media_input_v2(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet)8550 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8551 struct mbuf *packet)
8552 {
8553 #pragma unused(ifp, protocol, packet)
8554 return ENXIO;
8555 }
8556
8557 static errno_t
ifproto_media_preout(struct ifnet * ifp,protocol_family_t protocol,mbuf_t * packet,const struct sockaddr * dest,void * route,char * frame_type,char * link_layer_dest)8558 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8559 mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8560 char *link_layer_dest)
8561 {
8562 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8563 return ENXIO;
8564 }
8565
8566 static void
ifproto_media_event(struct ifnet * ifp,protocol_family_t protocol,const struct kev_msg * event)8567 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8568 const struct kev_msg *event)
8569 {
8570 #pragma unused(ifp, protocol, event)
8571 }
8572
8573 static errno_t
ifproto_media_ioctl(struct ifnet * ifp,protocol_family_t protocol,unsigned long command,void * argument)8574 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8575 unsigned long command, void *argument)
8576 {
8577 #pragma unused(ifp, protocol, command, argument)
8578 return ENXIO;
8579 }
8580
8581 static errno_t
ifproto_media_resolve_multi(ifnet_t ifp,const struct sockaddr * proto_addr,struct sockaddr_dl * out_ll,size_t ll_len)8582 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8583 struct sockaddr_dl *out_ll, size_t ll_len)
8584 {
8585 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8586 return ENXIO;
8587 }
8588
8589 static errno_t
ifproto_media_send_arp(struct ifnet * ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8590 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8591 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8592 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8593 {
8594 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8595 return ENXIO;
8596 }
8597
8598 extern int if_next_index(void);
8599 extern int tcp_ecn_outbound;
8600
/*
 * Configure a transmit classq for an interface: translate the global
 * flow-advisory/delay-based settings and the interface's scheduling
 * model into scheduler flags, inherit the packet drop limit from the
 * default send queue when setting up an auxiliary queue, and panic if
 * the classq cannot be initialized.
 */
void
dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
{
	uint32_t sflags = 0;
	int err;

	if (if_flowadv) {
		sflags |= PKTSCHEDF_QALG_FLOWCTL;
	}

	if (if_delaybased_queue) {
		sflags |= PKTSCHEDF_QALG_DELAYBASED;
	}

	if (ifp->if_output_sched_model ==
	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
	}
	/* Inherit drop limit from the default queue */
	if (ifp->if_snd != ifcq) {
		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
	}
	/* Initialize transmit queue(s) */
	err = ifclassq_setup(ifcq, ifp, sflags);
	if (err != 0) {
		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
		    "err=%d", __func__, ifp, err);
		/* NOTREACHED */
	}
}
8631
8632 errno_t
ifnet_attach(ifnet_t ifp,const struct sockaddr_dl * ll_addr)8633 ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
8634 {
8635 #if SKYWALK
8636 boolean_t netif_compat;
8637 if_nexus_netif nexus_netif;
8638 #endif /* SKYWALK */
8639 struct ifnet *tmp_if;
8640 struct ifaddr *ifa;
8641 struct if_data_internal if_data_saved;
8642 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8643 struct dlil_threading_info *dl_inp;
8644 thread_continue_t thfunc = NULL;
8645 int err;
8646
8647 if (ifp == NULL) {
8648 return EINVAL;
8649 }
8650
8651 /*
8652 * Serialize ifnet attach using dlil_ifnet_lock, in order to
8653 * prevent the interface from being configured while it is
8654 * embryonic, as ifnet_head_lock is dropped and reacquired
8655 * below prior to marking the ifnet with IFRF_ATTACHED.
8656 */
8657 dlil_if_lock();
8658 ifnet_head_lock_exclusive();
8659 /* Verify we aren't already on the list */
8660 TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
8661 if (tmp_if == ifp) {
8662 ifnet_head_done();
8663 dlil_if_unlock();
8664 return EEXIST;
8665 }
8666 }
8667
8668 lck_mtx_lock_spin(&ifp->if_ref_lock);
8669 if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
8670 panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
8671 __func__, ifp);
8672 /* NOTREACHED */
8673 }
8674 lck_mtx_unlock(&ifp->if_ref_lock);
8675
8676 ifnet_lock_exclusive(ifp);
8677
8678 /* Sanity check */
8679 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
8680 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
8681 VERIFY(ifp->if_threads_pending == 0);
8682
8683 if (ll_addr != NULL) {
8684 if (ifp->if_addrlen == 0) {
8685 ifp->if_addrlen = ll_addr->sdl_alen;
8686 } else if (ll_addr->sdl_alen != ifp->if_addrlen) {
8687 ifnet_lock_done(ifp);
8688 ifnet_head_done();
8689 dlil_if_unlock();
8690 return EINVAL;
8691 }
8692 }
8693
8694 /*
8695 * Allow interfaces without protocol families to attach
8696 * only if they have the necessary fields filled out.
8697 */
8698 if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
8699 DLIL_PRINTF("%s: Attempt to attach interface without "
8700 "family module - %d\n", __func__, ifp->if_family);
8701 ifnet_lock_done(ifp);
8702 ifnet_head_done();
8703 dlil_if_unlock();
8704 return ENODEV;
8705 }
8706
8707 /* Allocate protocol hash table */
8708 VERIFY(ifp->if_proto_hash == NULL);
8709 ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
8710 PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8711
8712 lck_mtx_lock_spin(&ifp->if_flt_lock);
8713 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
8714 TAILQ_INIT(&ifp->if_flt_head);
8715 VERIFY(ifp->if_flt_busy == 0);
8716 VERIFY(ifp->if_flt_waiters == 0);
8717 VERIFY(ifp->if_flt_non_os_count == 0);
8718 VERIFY(ifp->if_flt_no_tso_count == 0);
8719 lck_mtx_unlock(&ifp->if_flt_lock);
8720
8721 if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
8722 VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
8723 LIST_INIT(&ifp->if_multiaddrs);
8724 }
8725
8726 VERIFY(ifp->if_allhostsinm == NULL);
8727 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
8728 TAILQ_INIT(&ifp->if_addrhead);
8729
8730 if (ifp->if_index == 0) {
8731 int idx = if_next_index();
8732
8733 /*
8734 * Since we exhausted the list of
8735 * if_index's, try to find an empty slot
8736 * in ifindex2ifnet.
8737 */
8738 if (idx == -1 && if_index >= UINT16_MAX) {
8739 for (int i = 1; i < if_index; i++) {
8740 if (ifindex2ifnet[i] == NULL &&
8741 ifnet_addrs[i - 1] == NULL) {
8742 idx = i;
8743 break;
8744 }
8745 }
8746 }
8747 if (idx == -1) {
8748 ifp->if_index = 0;
8749 ifnet_lock_done(ifp);
8750 ifnet_head_done();
8751 dlil_if_unlock();
8752 return ENOBUFS;
8753 }
8754 ifp->if_index = (uint16_t)idx;
8755
8756 /* the lladdr passed at attach time is the permanent address */
8757 if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
8758 ll_addr->sdl_alen == ETHER_ADDR_LEN) {
8759 bcopy(CONST_LLADDR(ll_addr),
8760 dl_if->dl_if_permanent_ether,
8761 ETHER_ADDR_LEN);
8762 dl_if->dl_if_permanent_ether_is_set = 1;
8763 }
8764 }
8765 /* There should not be anything occupying this slot */
8766 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
8767
8768 /* allocate (if needed) and initialize a link address */
8769 ifa = dlil_alloc_lladdr(ifp, ll_addr);
8770 if (ifa == NULL) {
8771 ifnet_lock_done(ifp);
8772 ifnet_head_done();
8773 dlil_if_unlock();
8774 return ENOBUFS;
8775 }
8776
8777 VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
8778 ifnet_addrs[ifp->if_index - 1] = ifa;
8779
8780 /* make this address the first on the list */
8781 IFA_LOCK(ifa);
8782 /* hold a reference for ifnet_addrs[] */
8783 IFA_ADDREF_LOCKED(ifa);
8784 /* if_attach_link_ifa() holds a reference for ifa_link */
8785 if_attach_link_ifa(ifp, ifa);
8786 IFA_UNLOCK(ifa);
8787
8788 TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
8789 ifindex2ifnet[ifp->if_index] = ifp;
8790
8791 /* Hold a reference to the underlying dlil_ifnet */
8792 ifnet_reference(ifp);
8793
8794 /* Clear stats (save and restore other fields that we care) */
8795 if_data_saved = ifp->if_data;
8796 bzero(&ifp->if_data, sizeof(ifp->if_data));
8797 ifp->if_data.ifi_type = if_data_saved.ifi_type;
8798 ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
8799 ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
8800 ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
8801 ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
8802 ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
8803 ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
8804 ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
8805 ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
8806 ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
8807 ifnet_touch_lastchange(ifp);
8808
8809 VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
8810 ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
8811 ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);
8812
8813 dlil_ifclassq_setup(ifp, ifp->if_snd);
8814
8815 /* Sanity checks on the input thread storage */
8816 dl_inp = &dl_if->dl_if_inpstorage;
8817 bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
8818 VERIFY(dl_inp->dlth_flags == 0);
8819 VERIFY(dl_inp->dlth_wtot == 0);
8820 VERIFY(dl_inp->dlth_ifp == NULL);
8821 VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
8822 VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
8823 VERIFY(!dl_inp->dlth_affinity);
8824 VERIFY(ifp->if_inp == NULL);
8825 VERIFY(dl_inp->dlth_thread == THREAD_NULL);
8826 VERIFY(dl_inp->dlth_strategy == NULL);
8827 VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
8828 VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
8829 VERIFY(dl_inp->dlth_affinity_tag == 0);
8830
8831 #if IFNET_INPUT_SANITY_CHK
8832 VERIFY(dl_inp->dlth_pkts_cnt == 0);
8833 #endif /* IFNET_INPUT_SANITY_CHK */
8834
8835 VERIFY(ifp->if_poll_thread == THREAD_NULL);
8836 dlil_reset_rxpoll_params(ifp);
8837 /*
8838 * A specific DLIL input thread is created per non-loopback interface.
8839 */
8840 if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
8841 ifp->if_inp = dl_inp;
8842 ifnet_incr_pending_thread_count(ifp);
8843 err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
8844 if (err == ENODEV) {
8845 VERIFY(thfunc == NULL);
8846 ifnet_decr_pending_thread_count(ifp);
8847 } else if (err != 0) {
8848 panic_plain("%s: ifp=%p couldn't get an input thread; "
8849 "err=%d", __func__, ifp, err);
8850 /* NOTREACHED */
8851 }
8852 }
8853 /*
8854 * If the driver supports the new transmit model, calculate flow hash
8855 * and create a workloop starter thread to invoke the if_start callback
8856 * where the packets may be dequeued and transmitted.
8857 */
8858 if (ifp->if_eflags & IFEF_TXSTART) {
8859 thread_precedence_policy_data_t info;
8860 __unused kern_return_t kret;
8861
8862 ifp->if_flowhash = ifnet_calc_flowhash(ifp);
8863 VERIFY(ifp->if_flowhash != 0);
8864 VERIFY(ifp->if_start_thread == THREAD_NULL);
8865
8866 ifnet_set_start_cycle(ifp, NULL);
8867 ifp->if_start_active = 0;
8868 ifp->if_start_req = 0;
8869 ifp->if_start_flags = 0;
8870 VERIFY(ifp->if_start != NULL);
8871 ifnet_incr_pending_thread_count(ifp);
8872 if ((err = kernel_thread_start(ifnet_start_thread_func,
8873 ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
8874 panic_plain("%s: "
8875 "ifp=%p couldn't get a start thread; "
8876 "err=%d", __func__, ifp, err);
8877 /* NOTREACHED */
8878 }
8879 bzero(&info, sizeof(info));
8880 info.importance = 1;
8881 kret = thread_policy_set(ifp->if_start_thread,
8882 THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8883 THREAD_PRECEDENCE_POLICY_COUNT);
8884 ASSERT(kret == KERN_SUCCESS);
8885 } else {
8886 ifp->if_flowhash = 0;
8887 }
8888
8889 /* Reset polling parameters */
8890 ifnet_set_poll_cycle(ifp, NULL);
8891 ifp->if_poll_update = 0;
8892 ifp->if_poll_flags = 0;
8893 ifp->if_poll_req = 0;
8894 VERIFY(ifp->if_poll_thread == THREAD_NULL);
8895
8896 /*
8897 * If the driver supports the new receive model, create a poller
8898 * thread to invoke if_input_poll callback where the packets may
8899 * be dequeued from the driver and processed for reception.
8900 * if the interface is netif compat then the poller thread is
8901 * managed by netif.
8902 */
8903 if (thfunc == dlil_rxpoll_input_thread_func) {
8904 thread_precedence_policy_data_t info;
8905 __unused kern_return_t kret;
8906 #if SKYWALK
8907 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
8908 #endif /* SKYWALK */
8909 VERIFY(ifp->if_input_poll != NULL);
8910 VERIFY(ifp->if_input_ctl != NULL);
8911 ifnet_incr_pending_thread_count(ifp);
8912 if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
8913 &ifp->if_poll_thread)) != KERN_SUCCESS) {
8914 panic_plain("%s: ifp=%p couldn't get a poll thread; "
8915 "err=%d", __func__, ifp, err);
8916 /* NOTREACHED */
8917 }
8918 bzero(&info, sizeof(info));
8919 info.importance = 1;
8920 kret = thread_policy_set(ifp->if_poll_thread,
8921 THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8922 THREAD_PRECEDENCE_POLICY_COUNT);
8923 ASSERT(kret == KERN_SUCCESS);
8924 }
8925
8926 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
8927 VERIFY(ifp->if_desc.ifd_len == 0);
8928 VERIFY(ifp->if_desc.ifd_desc != NULL);
8929
8930 /* Record attach PC stacktrace */
8931 ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
8932
8933 ifp->if_updatemcasts = 0;
8934 if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
8935 struct ifmultiaddr *ifma;
8936 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
8937 IFMA_LOCK(ifma);
8938 if (ifma->ifma_addr->sa_family == AF_LINK ||
8939 ifma->ifma_addr->sa_family == AF_UNSPEC) {
8940 ifp->if_updatemcasts++;
8941 }
8942 IFMA_UNLOCK(ifma);
8943 }
8944
8945 DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
8946 "membership(s)\n", if_name(ifp),
8947 ifp->if_updatemcasts);
8948 }
8949
8950 /* Clear logging parameters */
8951 bzero(&ifp->if_log, sizeof(ifp->if_log));
8952
8953 /* Clear foreground/realtime activity timestamps */
8954 ifp->if_fg_sendts = 0;
8955 ifp->if_rt_sendts = 0;
8956
8957 /* Clear throughput estimates and radio type */
8958 ifp->if_estimated_up_bucket = 0;
8959 ifp->if_estimated_down_bucket = 0;
8960 ifp->if_radio_type = 0;
8961 ifp->if_radio_channel = 0;
8962
8963 VERIFY(ifp->if_delegated.ifp == NULL);
8964 VERIFY(ifp->if_delegated.type == 0);
8965 VERIFY(ifp->if_delegated.family == 0);
8966 VERIFY(ifp->if_delegated.subfamily == 0);
8967 VERIFY(ifp->if_delegated.expensive == 0);
8968 VERIFY(ifp->if_delegated.constrained == 0);
8969
8970 VERIFY(ifp->if_agentids == NULL);
8971 VERIFY(ifp->if_agentcount == 0);
8972
8973 /* Reset interface state */
8974 bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
8975 ifp->if_interface_state.valid_bitmask |=
8976 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
8977 ifp->if_interface_state.interface_availability =
8978 IF_INTERFACE_STATE_INTERFACE_AVAILABLE;
8979
8980 /* Initialize Link Quality Metric (loopback [lo0] is always good) */
8981 if (ifp == lo_ifp) {
8982 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
8983 ifp->if_interface_state.valid_bitmask |=
8984 IF_INTERFACE_STATE_LQM_STATE_VALID;
8985 } else {
8986 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
8987 }
8988
8989 /*
8990 * Enable ECN capability on this interface depending on the
8991 * value of ECN global setting
8992 */
8993 if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
8994 if_set_eflags(ifp, IFEF_ECN_ENABLE);
8995 if_clear_eflags(ifp, IFEF_ECN_DISABLE);
8996 }
8997
8998 /*
8999 * Built-in Cyclops always on policy for WiFi infra
9000 */
9001 if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
9002 errno_t error;
9003
9004 error = if_set_qosmarking_mode(ifp,
9005 IFRTYPE_QOSMARKING_FASTLANE);
9006 if (error != 0) {
9007 DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
9008 __func__, ifp->if_xname, error);
9009 } else {
9010 if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9011 #if (DEVELOPMENT || DEBUG)
9012 DLIL_PRINTF("%s fastlane enabled on %s\n",
9013 __func__, ifp->if_xname);
9014 #endif /* (DEVELOPMENT || DEBUG) */
9015 }
9016 }
9017
9018 ifnet_lock_done(ifp);
9019 ifnet_head_done();
9020
9021 #if SKYWALK
9022 netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
9023 #endif /* SKYWALK */
9024
9025 lck_mtx_lock(&ifp->if_cached_route_lock);
9026 /* Enable forwarding cached route */
9027 ifp->if_fwd_cacheok = 1;
9028 /* Clean up any existing cached routes */
9029 ROUTE_RELEASE(&ifp->if_fwd_route);
9030 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9031 ROUTE_RELEASE(&ifp->if_src_route);
9032 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9033 ROUTE_RELEASE(&ifp->if_src_route6);
9034 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9035 lck_mtx_unlock(&ifp->if_cached_route_lock);
9036
9037 ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
9038
9039 /*
9040 * Allocate and attach IGMPv3/MLDv2 interface specific variables
9041 * and trees; do this before the ifnet is marked as attached.
9042 * The ifnet keeps the reference to the info structures even after
9043 * the ifnet is detached, since the network-layer records still
9044 * refer to the info structures even after that. This also
9045 * makes it possible for them to still function after the ifnet
9046 * is recycled or reattached.
9047 */
9048 #if INET
9049 if (IGMP_IFINFO(ifp) == NULL) {
9050 IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
9051 VERIFY(IGMP_IFINFO(ifp) != NULL);
9052 } else {
9053 VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
9054 igmp_domifreattach(IGMP_IFINFO(ifp));
9055 }
9056 #endif /* INET */
9057 if (MLD_IFINFO(ifp) == NULL) {
9058 MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
9059 VERIFY(MLD_IFINFO(ifp) != NULL);
9060 } else {
9061 VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
9062 mld_domifreattach(MLD_IFINFO(ifp));
9063 }
9064
9065 VERIFY(ifp->if_data_threshold == 0);
9066 VERIFY(ifp->if_dt_tcall != NULL);
9067
9068 /*
9069 * Wait for the created kernel threads for I/O to get
9070 * scheduled and run at least once before we proceed
9071 * to mark interface as attached.
9072 */
9073 lck_mtx_lock(&ifp->if_ref_lock);
9074 while (ifp->if_threads_pending != 0) {
9075 DLIL_PRINTF("%s: Waiting for all kernel threads created for "
9076 "interface %s to get scheduled at least once.\n",
9077 __func__, ifp->if_xname);
9078 (void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
9079 __func__, NULL);
9080 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
9081 }
9082 lck_mtx_unlock(&ifp->if_ref_lock);
9083 DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
9084 "at least once. Proceeding.\n", __func__, ifp->if_xname);
9085
9086 /* Final mark this ifnet as attached. */
9087 ifnet_lock_exclusive(ifp);
9088 lck_mtx_lock_spin(&ifp->if_ref_lock);
9089 ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
9090 lck_mtx_unlock(&ifp->if_ref_lock);
9091 if (net_rtref) {
9092 /* boot-args override; enable idle notification */
9093 (void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
9094 IFRF_IDLE_NOTIFY);
9095 } else {
9096 /* apply previous request(s) to set the idle flags, if any */
9097 (void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
9098 ifp->if_idle_new_flags_mask);
9099 }
9100 #if SKYWALK
9101 /* the interface is fully attached; let the nexus adapter know */
9102 if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
9103 if (netif_compat) {
9104 if (sk_netif_compat_txmodel ==
9105 NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
9106 ifnet_enqueue_multi_setup(ifp,
9107 sk_tx_delay_qlen, sk_tx_delay_timeout);
9108 }
9109 ifp->if_nx_netif = nexus_netif;
9110 }
9111 ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
9112 }
9113 #endif /* SKYWALK */
9114 ifnet_lock_done(ifp);
9115 dlil_if_unlock();
9116
9117 #if PF
9118 /*
9119 * Attach packet filter to this interface, if enabled.
9120 */
9121 pf_ifnet_hook(ifp, 1);
9122 #endif /* PF */
9123
9124 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);
9125
9126 if (dlil_verbose) {
9127 DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
9128 (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
9129 }
9130
9131 return 0;
9132 }
9133
9134 /*
 * Prepare the storage for the first/permanent link address, which must
 * have the same lifetime as the ifnet itself. Although the link
9137 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9138 * its location in memory must never change as it may still be referred
9139 * to by some parts of the system afterwards (unfortunate implementation
9140 * artifacts inherited from BSD.)
9141 *
9142 * Caller must hold ifnet lock as writer.
9143 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;	/* address and mask sockaddrs */
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Size the sockaddr_dl: the interface name occupies the start of
	 * sdl_data (masklen covers header + name), followed by the
	 * link-layer address bytes; round the total up to a 32-bit
	 * multiple and never below sizeof(struct sockaddr_dl).
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			/* one ifaddr followed by max-sized addr + mask */
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure.  This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;		/* previous lladdr, released below */
	ifp->if_lladdr = ifa;

	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		/* interface name occupies the head of sdl_data */
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	/* netmask matches every byte of the name portion */
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	if (oifa != NULL) {
		/* drop the ifnet's reference on the replaced lladdr */
		IFA_REMREF(oifa);
	}

	return ifa;
}
9252
/*
 * Drop all network-layer (IPv4 and IPv6) addresses configured on the
 * interface.  The permanent AF_LINK address is not touched here; it is
 * detached separately in ifnet_detach_final().
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9261
/*
 * First phase of interface detach: transition the interface from
 * ATTACHED to DETACHING, unlink it from the global interface lists so
 * it is no longer visible to lookups, release ancillary state, and
 * queue it for the detacher thread which performs the final teardown
 * (ifnet_detach_final) asynchronously.
 *
 * Returns:
 *	0	detach has been initiated
 *	EINVAL	ifp is NULL, or the interface was never attached
 *	ENXIO	the interface is already being detached
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Mark ND6 CGA state as uninitialized, if present */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	/* Destroy any network emulation state attached to output */
	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	/* Transition ATTACHED -> DETACHING exactly once under if_ref_lock */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9454
/*
 * Append ifp to the global list of interfaces awaiting final detach
 * and wake the detacher thread (see ifnet_detacher_thread_cont).
 * Caller must hold the dlil lock.
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);	/* counter must not wrap */
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	wakeup((caddr_t)&ifnet_delayed_run);
}
9465
9466 static struct ifnet *
ifnet_detaching_dequeue(void)9467 ifnet_detaching_dequeue(void)
9468 {
9469 struct ifnet *ifp;
9470
9471 dlil_if_lock_assert();
9472
9473 ifp = TAILQ_FIRST(&ifnet_detaching_head);
9474 VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9475 if (ifp != NULL) {
9476 VERIFY(ifnet_detaching_cnt != 0);
9477 --ifnet_detaching_cnt;
9478 TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9479 ifp->if_detaching_link.tqe_next = NULL;
9480 ifp->if_detaching_link.tqe_prev = NULL;
9481 }
9482 return ifp;
9483 }
9484
/*
 * Body of the interface detacher thread, written as a continuation:
 * drain the queue of detaching interfaces by calling
 * ifnet_detach_final() on each (dropping the dlil lock around the
 * call), then block on ifnet_delayed_run with this function as the
 * thread_block() continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		/* first pass after thread creation; see thread_func below */
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constrain so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* final teardown may block; do it unlocked */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* queue drained; sleep until ifnet_detaching_enqueue() wakes us */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);	/* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9527
/*
 * Entry point of the detacher thread: arm the initial wait on
 * ifnet_delayed_run, mark the thread embryonic, and issue a self
 * wakeup so the continuation runs once to clear the embryonic state
 * (see ifnet_detacher_thread_cont).  Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9544
9545 static void
ifnet_detach_final(struct ifnet * ifp)9546 ifnet_detach_final(struct ifnet *ifp)
9547 {
9548 struct ifnet_filter *filter, *filter_next;
9549 struct dlil_ifnet *dlifp;
9550 struct ifnet_filter_head fhead;
9551 struct dlil_threading_info *inp;
9552 struct ifaddr *ifa;
9553 ifnet_detached_func if_free;
9554 int i;
9555
9556 #if SKYWALK
9557 dlil_netif_detach_notify(ifp);
9558 /*
9559 * Wait for the datapath to quiesce before tearing down
9560 * netif/flowswitch nexuses.
9561 */
9562 dlil_quiesce_and_detach_nexuses(ifp);
9563 #endif /* SKYWALK */
9564
9565 lck_mtx_lock(&ifp->if_ref_lock);
9566 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9567 panic("%s: flags mismatch (detaching not set) ifp=%p",
9568 __func__, ifp);
9569 /* NOTREACHED */
9570 }
9571
9572 /*
9573 * Wait until the existing IO references get released
9574 * before we proceed with ifnet_detach. This is not a
9575 * common case, so block without using a continuation.
9576 */
9577 while (ifp->if_refio > 0) {
9578 DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9579 "to be released\n", __func__, if_name(ifp));
9580 (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9581 (PZERO - 1), "ifnet_ioref_wait", NULL);
9582 }
9583
9584 VERIFY(ifp->if_datamov == 0);
9585 VERIFY(ifp->if_drainers == 0);
9586 VERIFY(ifp->if_suspend == 0);
9587 ifp->if_refflags &= ~IFRF_READY;
9588 lck_mtx_unlock(&ifp->if_ref_lock);
9589
9590 /* Clear agent IDs */
9591 if (ifp->if_agentids != NULL) {
9592 kfree_data(ifp->if_agentids,
9593 sizeof(uuid_t) * ifp->if_agentcount);
9594 ifp->if_agentids = NULL;
9595 }
9596 ifp->if_agentcount = 0;
9597
9598 #if SKYWALK
9599 VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9600 #endif /* SKYWALK */
9601 /* Drain and destroy send queue */
9602 ifclassq_teardown(ifp->if_snd);
9603
9604 /* Detach interface filters */
9605 lck_mtx_lock(&ifp->if_flt_lock);
9606 if_flt_monitor_enter(ifp);
9607
9608 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9609 fhead = ifp->if_flt_head;
9610 TAILQ_INIT(&ifp->if_flt_head);
9611
9612 for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9613 filter_next = TAILQ_NEXT(filter, filt_next);
9614 lck_mtx_unlock(&ifp->if_flt_lock);
9615
9616 dlil_detach_filter_internal(filter, 1);
9617 lck_mtx_lock(&ifp->if_flt_lock);
9618 }
9619 if_flt_monitor_leave(ifp);
9620 lck_mtx_unlock(&ifp->if_flt_lock);
9621
9622 /* Tell upper layers to drop their network addresses */
9623 if_purgeaddrs(ifp);
9624
9625 ifnet_lock_exclusive(ifp);
9626
9627 /* Unplumb all protocols */
9628 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9629 struct if_proto *proto;
9630
9631 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9632 while (proto != NULL) {
9633 protocol_family_t family = proto->protocol_family;
9634 ifnet_lock_done(ifp);
9635 proto_unplumb(family, ifp);
9636 ifnet_lock_exclusive(ifp);
9637 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9638 }
9639 /* There should not be any protocols left */
9640 VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9641 }
9642 kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9643 ifp->if_proto_hash = NULL;
9644
9645 /* Detach (permanent) link address from if_addrhead */
9646 ifa = TAILQ_FIRST(&ifp->if_addrhead);
9647 VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9648 IFA_LOCK(ifa);
9649 if_detach_link_ifa(ifp, ifa);
9650 IFA_UNLOCK(ifa);
9651
9652 /* Remove (permanent) link address from ifnet_addrs[] */
9653 IFA_REMREF(ifa);
9654 ifnet_addrs[ifp->if_index - 1] = NULL;
9655
9656 /* This interface should not be on {ifnet_head,detaching} */
9657 VERIFY(ifp->if_link.tqe_next == NULL);
9658 VERIFY(ifp->if_link.tqe_prev == NULL);
9659 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9660 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9661 VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9662 VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9663
9664 /* The slot should have been emptied */
9665 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9666
9667 /* There should not be any addresses left */
9668 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9669
9670 /*
9671 * Signal the starter thread to terminate itself, and wait until
9672 * it has exited.
9673 */
9674 if (ifp->if_start_thread != THREAD_NULL) {
9675 lck_mtx_lock_spin(&ifp->if_start_lock);
9676 ifp->if_start_flags |= IFSF_TERMINATING;
9677 wakeup_one((caddr_t)&ifp->if_start_thread);
9678 lck_mtx_unlock(&ifp->if_start_lock);
9679
9680 /* wait for starter thread to terminate */
9681 lck_mtx_lock(&ifp->if_start_lock);
9682 while (ifp->if_start_thread != THREAD_NULL) {
9683 if (dlil_verbose) {
9684 DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
9685 __func__,
9686 if_name(ifp));
9687 }
9688 (void) msleep(&ifp->if_start_thread,
9689 &ifp->if_start_lock, (PZERO - 1),
9690 "ifnet_start_thread_exit", NULL);
9691 }
9692 lck_mtx_unlock(&ifp->if_start_lock);
9693 if (dlil_verbose) {
9694 DLIL_PRINTF("%s: %s starter thread termination complete",
9695 __func__, if_name(ifp));
9696 }
9697 }
9698
9699 /*
9700 * Signal the poller thread to terminate itself, and wait until
9701 * it has exited.
9702 */
9703 if (ifp->if_poll_thread != THREAD_NULL) {
9704 #if SKYWALK
9705 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9706 #endif /* SKYWALK */
9707 lck_mtx_lock_spin(&ifp->if_poll_lock);
9708 ifp->if_poll_flags |= IF_POLLF_TERMINATING;
9709 wakeup_one((caddr_t)&ifp->if_poll_thread);
9710 lck_mtx_unlock(&ifp->if_poll_lock);
9711
9712 /* wait for poller thread to terminate */
9713 lck_mtx_lock(&ifp->if_poll_lock);
9714 while (ifp->if_poll_thread != THREAD_NULL) {
9715 if (dlil_verbose) {
9716 DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
9717 __func__,
9718 if_name(ifp));
9719 }
9720 (void) msleep(&ifp->if_poll_thread,
9721 &ifp->if_poll_lock, (PZERO - 1),
9722 "ifnet_poll_thread_exit", NULL);
9723 }
9724 lck_mtx_unlock(&ifp->if_poll_lock);
9725 if (dlil_verbose) {
9726 DLIL_PRINTF("%s: %s poller thread termination complete\n",
9727 __func__, if_name(ifp));
9728 }
9729 }
9730
9731 /*
9732 * If thread affinity was set for the workloop thread, we will need
9733 * to tear down the affinity and release the extra reference count
9734 * taken at attach time. Does not apply to lo0 or other interfaces
9735 * without dedicated input threads.
9736 */
9737 if ((inp = ifp->if_inp) != NULL) {
9738 VERIFY(inp != dlil_main_input_thread);
9739
9740 if (inp->dlth_affinity) {
9741 struct thread *tp, *wtp, *ptp;
9742
9743 lck_mtx_lock_spin(&inp->dlth_lock);
9744 wtp = inp->dlth_driver_thread;
9745 inp->dlth_driver_thread = THREAD_NULL;
9746 ptp = inp->dlth_poller_thread;
9747 inp->dlth_poller_thread = THREAD_NULL;
9748 ASSERT(inp->dlth_thread != THREAD_NULL);
9749 tp = inp->dlth_thread; /* don't nullify now */
9750 inp->dlth_affinity_tag = 0;
9751 inp->dlth_affinity = FALSE;
9752 lck_mtx_unlock(&inp->dlth_lock);
9753
9754 /* Tear down poll thread affinity */
9755 if (ptp != NULL) {
9756 VERIFY(ifp->if_eflags & IFEF_RXPOLL);
9757 VERIFY(ifp->if_xflags & IFXF_LEGACY);
9758 (void) dlil_affinity_set(ptp,
9759 THREAD_AFFINITY_TAG_NULL);
9760 thread_deallocate(ptp);
9761 }
9762
9763 /* Tear down workloop thread affinity */
9764 if (wtp != NULL) {
9765 (void) dlil_affinity_set(wtp,
9766 THREAD_AFFINITY_TAG_NULL);
9767 thread_deallocate(wtp);
9768 }
9769
9770 /* Tear down DLIL input thread affinity */
9771 (void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
9772 thread_deallocate(tp);
9773 }
9774
9775 /* disassociate ifp DLIL input thread */
9776 ifp->if_inp = NULL;
9777
9778 /* if the worker thread was created, tell it to terminate */
9779 if (inp->dlth_thread != THREAD_NULL) {
9780 lck_mtx_lock_spin(&inp->dlth_lock);
9781 inp->dlth_flags |= DLIL_INPUT_TERMINATE;
9782 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
9783 wakeup_one((caddr_t)&inp->dlth_flags);
9784 }
9785 lck_mtx_unlock(&inp->dlth_lock);
9786 ifnet_lock_done(ifp);
9787
9788 /* wait for the input thread to terminate */
9789 lck_mtx_lock_spin(&inp->dlth_lock);
9790 while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
9791 == 0) {
9792 (void) msleep(&inp->dlth_flags, &inp->dlth_lock,
9793 (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
9794 }
9795 lck_mtx_unlock(&inp->dlth_lock);
9796 ifnet_lock_exclusive(ifp);
9797 }
9798
9799 /* clean-up input thread state */
9800 dlil_clean_threading_info(inp);
9801 /* clean-up poll parameters */
9802 VERIFY(ifp->if_poll_thread == THREAD_NULL);
9803 dlil_reset_rxpoll_params(ifp);
9804 }
9805
9806 /* The driver might unload, so point these to ourselves */
9807 if_free = ifp->if_free;
9808 ifp->if_output_dlil = ifp_if_output;
9809 ifp->if_output = ifp_if_output;
9810 ifp->if_pre_enqueue = ifp_if_output;
9811 ifp->if_start = ifp_if_start;
9812 ifp->if_output_ctl = ifp_if_ctl;
9813 ifp->if_input_dlil = ifp_if_input;
9814 ifp->if_input_poll = ifp_if_input_poll;
9815 ifp->if_input_ctl = ifp_if_ctl;
9816 ifp->if_ioctl = ifp_if_ioctl;
9817 ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
9818 ifp->if_free = ifp_if_free;
9819 ifp->if_demux = ifp_if_demux;
9820 ifp->if_event = ifp_if_event;
9821 ifp->if_framer_legacy = ifp_if_framer;
9822 ifp->if_framer = ifp_if_framer_extended;
9823 ifp->if_add_proto = ifp_if_add_proto;
9824 ifp->if_del_proto = ifp_if_del_proto;
9825 ifp->if_check_multi = ifp_if_check_multi;
9826
9827 /* wipe out interface description */
9828 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9829 ifp->if_desc.ifd_len = 0;
9830 VERIFY(ifp->if_desc.ifd_desc != NULL);
9831 bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
9832
9833 /* there shouldn't be any delegation by now */
9834 VERIFY(ifp->if_delegated.ifp == NULL);
9835 VERIFY(ifp->if_delegated.type == 0);
9836 VERIFY(ifp->if_delegated.family == 0);
9837 VERIFY(ifp->if_delegated.subfamily == 0);
9838 VERIFY(ifp->if_delegated.expensive == 0);
9839 VERIFY(ifp->if_delegated.constrained == 0);
9840
9841 /* QoS marking get cleared */
9842 if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9843 if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
9844
9845 #if SKYWALK
9846 /* the nexus destructor is responsible for clearing these */
9847 VERIFY(ifp->if_na_ops == NULL);
9848 VERIFY(ifp->if_na == NULL);
9849 #endif /* SKYWALK */
9850
9851 /* promiscuous/allmulti counts need to start at zero again */
9852 ifp->if_pcount = 0;
9853 ifp->if_amcount = 0;
9854 ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
9855
9856 ifnet_lock_done(ifp);
9857
9858 #if PF
9859 /*
9860 * Detach this interface from packet filter, if enabled.
9861 */
9862 pf_ifnet_hook(ifp, 0);
9863 #endif /* PF */
9864
9865 /* Filter list should be empty */
9866 lck_mtx_lock_spin(&ifp->if_flt_lock);
9867 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9868 VERIFY(ifp->if_flt_busy == 0);
9869 VERIFY(ifp->if_flt_waiters == 0);
9870 VERIFY(ifp->if_flt_non_os_count == 0);
9871 VERIFY(ifp->if_flt_no_tso_count == 0);
9872 lck_mtx_unlock(&ifp->if_flt_lock);
9873
9874 /* Last chance to drain send queue */
9875 if_qflush_snd(ifp, 0);
9876
9877 /* Last chance to cleanup any cached route */
9878 lck_mtx_lock(&ifp->if_cached_route_lock);
9879 VERIFY(!ifp->if_fwd_cacheok);
9880 ROUTE_RELEASE(&ifp->if_fwd_route);
9881 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9882 ROUTE_RELEASE(&ifp->if_src_route);
9883 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9884 ROUTE_RELEASE(&ifp->if_src_route6);
9885 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9886 lck_mtx_unlock(&ifp->if_cached_route_lock);
9887
9888 VERIFY(ifp->if_data_threshold == 0);
9889 VERIFY(ifp->if_dt_tcall != NULL);
9890 VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
9891
9892 ifnet_llreach_ifdetach(ifp);
9893
9894 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
9895
9896 /*
9897 * Finally, mark this ifnet as detached.
9898 */
9899 if (dlil_verbose) {
9900 DLIL_PRINTF("%s: detached\n", if_name(ifp));
9901 }
9902 lck_mtx_lock_spin(&ifp->if_ref_lock);
9903 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9904 panic("%s: flags mismatch (detaching not set) ifp=%p",
9905 __func__, ifp);
9906 /* NOTREACHED */
9907 }
9908 ifp->if_refflags &= ~IFRF_DETACHING;
9909 lck_mtx_unlock(&ifp->if_ref_lock);
9910 if (if_free != NULL) {
9911 if_free(ifp);
9912 }
9913
9914 ifclassq_release(&ifp->if_snd);
9915
9916 /* we're fully detached, clear the "in use" bit */
9917 dlifp = (struct dlil_ifnet *)ifp;
9918 lck_mtx_lock(&dlifp->dl_if_lock);
9919 ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
9920 dlifp->dl_if_flags &= ~DLIF_INUSE;
9921 lck_mtx_unlock(&dlifp->dl_if_lock);
9922
9923 /* Release reference held during ifnet attach */
9924 ifnet_release(ifp);
9925 }
9926
9927 errno_t
ifp_if_output(struct ifnet * ifp,struct mbuf * m)9928 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
9929 {
9930 #pragma unused(ifp)
9931 m_freem_list(m);
9932 return 0;
9933 }
9934
void
ifp_if_start(struct ifnet *ifp)
{
	/* Detached-interface start stub: flush anything still queued. */
	ifnet_purge(ifp);
}
9940
9941 static errno_t
ifp_if_input(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)9942 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
9943 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
9944 boolean_t poll, struct thread *tp)
9945 {
9946 #pragma unused(ifp, m_tail, s, poll, tp)
9947 m_freem_list(m_head);
9948 return ENXIO;
9949 }
9950
9951 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9952 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9953 struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9954 {
9955 #pragma unused(ifp, flags, max_cnt)
9956 if (m_head != NULL) {
9957 *m_head = NULL;
9958 }
9959 if (m_tail != NULL) {
9960 *m_tail = NULL;
9961 }
9962 if (cnt != NULL) {
9963 *cnt = 0;
9964 }
9965 if (len != NULL) {
9966 *len = 0;
9967 }
9968 }
9969
9970 static errno_t
ifp_if_ctl(struct ifnet * ifp,ifnet_ctl_cmd_t cmd,u_int32_t arglen,void * arg)9971 ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
9972 {
9973 #pragma unused(ifp, cmd, arglen, arg)
9974 return EOPNOTSUPP;
9975 }
9976
9977 static errno_t
ifp_if_demux(struct ifnet * ifp,struct mbuf * m,char * fh,protocol_family_t * pf)9978 ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
9979 {
9980 #pragma unused(ifp, fh, pf)
9981 m_freem(m);
9982 return EJUSTRETURN;
9983 }
9984
9985 static errno_t
ifp_if_add_proto(struct ifnet * ifp,protocol_family_t pf,const struct ifnet_demux_desc * da,u_int32_t dc)9986 ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
9987 const struct ifnet_demux_desc *da, u_int32_t dc)
9988 {
9989 #pragma unused(ifp, pf, da, dc)
9990 return EINVAL;
9991 }
9992
9993 static errno_t
ifp_if_del_proto(struct ifnet * ifp,protocol_family_t pf)9994 ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
9995 {
9996 #pragma unused(ifp, pf)
9997 return EINVAL;
9998 }
9999
10000 static errno_t
ifp_if_check_multi(struct ifnet * ifp,const struct sockaddr * sa)10001 ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
10002 {
10003 #pragma unused(ifp, sa)
10004 return EOPNOTSUPP;
10005 }
10006
/*
 * Detached-interface legacy framer stub.  The signature differs by
 * platform: on non-macOS targets the legacy framer also carries the
 * pre/post space parameters, on macOS it does not.  In both cases this
 * simply forwards to ifp_if_framer_extended() (with NULL pre/post on
 * macOS, where the legacy signature has none to forward).
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10025
10026 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10027 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10028 const struct sockaddr *sa, const char *ll, const char *t,
10029 u_int32_t *pre, u_int32_t *post)
10030 {
10031 #pragma unused(ifp, sa, ll, t)
10032 m_freem(*m);
10033 *m = NULL;
10034
10035 if (pre != NULL) {
10036 *pre = 0;
10037 }
10038 if (post != NULL) {
10039 *post = 0;
10040 }
10041
10042 return EJUSTRETURN;
10043 }
10044
10045 errno_t
ifp_if_ioctl(struct ifnet * ifp,unsigned long cmd,void * arg)10046 ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
10047 {
10048 #pragma unused(ifp, cmd, arg)
10049 return EOPNOTSUPP;
10050 }
10051
10052 static errno_t
ifp_if_set_bpf_tap(struct ifnet * ifp,bpf_tap_mode tm,bpf_packet_func f)10053 ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
10054 {
10055 #pragma unused(ifp, tm, f)
10056 /* XXX not sure what to do here */
10057 return 0;
10058 }
10059
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
	/* Detached-interface free stub: intentionally does nothing. */
}
10065
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
	/* Detached-interface event stub: events are ignored. */
}
10071
/*
 * Find or allocate a dlil_ifnet for the given (family, uniqueid/ifxname)
 * tuple.  On success returns 0 with *ifp set and referenced via
 * dlil_if_ref().  Returns EBUSY if an in-use interface already claims
 * the extended name or unique id, or ENOMEM if the unique-id copy
 * cannot be allocated.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		/* only entries of the requested family can conflict */
		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			/*
			 * NOTE(review): dlifp1 dangles after this zfree;
			 * the end-label VERIFY only inspects the pointer
			 * value (alignment), it does not dereference it.
			 */
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* name/xname point at storage embedded in the dlil_ifnet */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	/* per-ifnet locks, valid for the lifetime of the dlil_ifnet */
	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* sanity: the returned object must honor the 64-bit alignment fixup */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10249
/*
 * Common release path for a dlil_ifnet: drop the net-API allocation
 * counters, free any out-of-line broadcast address storage, reset the
 * name/xname back to the embedded storage, and (optionally) clear the
 * DLIF_INUSE flag.  Takes the ifnet lock exclusively, and the
 * dl_if_lock nested inside it.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* broadcast address too large for the inline buffer was kalloc'ed */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10280
10281 __private_extern__ void
dlil_if_release(ifnet_t ifp)10282 dlil_if_release(ifnet_t ifp)
10283 {
10284 _dlil_if_release(ifp, false);
10285 }
10286
10287 __private_extern__ void
dlil_if_lock(void)10288 dlil_if_lock(void)
10289 {
10290 lck_mtx_lock(&dlil_ifnet_lock);
10291 }
10292
10293 __private_extern__ void
dlil_if_unlock(void)10294 dlil_if_unlock(void)
10295 {
10296 lck_mtx_unlock(&dlil_ifnet_lock);
10297 }
10298
10299 __private_extern__ void
dlil_if_lock_assert(void)10300 dlil_if_lock_assert(void)
10301 {
10302 LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
10303 }
10304
10305 __private_extern__ void
dlil_proto_unplumb_all(struct ifnet * ifp)10306 dlil_proto_unplumb_all(struct ifnet *ifp)
10307 {
10308 /*
10309 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
10310 * each bucket contains exactly one entry; PF_VLAN does not need an
10311 * explicit unplumb.
10312 *
10313 * if_proto_hash[3] is for other protocols; we expect anything
10314 * in this bucket to respond to the DETACHING event (which would
10315 * have happened by now) and do the unplumb then.
10316 */
10317 (void) proto_unplumb(PF_INET, ifp);
10318 (void) proto_unplumb(PF_INET6, ifp);
10319 }
10320
10321 static void
ifp_src_route_copyout(struct ifnet * ifp,struct route * dst)10322 ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
10323 {
10324 lck_mtx_lock_spin(&ifp->if_cached_route_lock);
10325 lck_mtx_convert_spin(&ifp->if_cached_route_lock);
10326
10327 route_copyout(dst, &ifp->if_src_route, sizeof(*dst));
10328
10329 lck_mtx_unlock(&ifp->if_cached_route_lock);
10330 }
10331
10332 static void
ifp_src_route_copyin(struct ifnet * ifp,struct route * src)10333 ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
10334 {
10335 lck_mtx_lock_spin(&ifp->if_cached_route_lock);
10336 lck_mtx_convert_spin(&ifp->if_cached_route_lock);
10337
10338 if (ifp->if_fwd_cacheok) {
10339 route_copyin(src, &ifp->if_src_route, sizeof(*src));
10340 } else {
10341 ROUTE_RELEASE(src);
10342 }
10343 lck_mtx_unlock(&ifp->if_cached_route_lock);
10344 }
10345
10346 static void
ifp_src_route6_copyout(struct ifnet * ifp,struct route_in6 * dst)10347 ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
10348 {
10349 lck_mtx_lock_spin(&ifp->if_cached_route_lock);
10350 lck_mtx_convert_spin(&ifp->if_cached_route_lock);
10351
10352 route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
10353 sizeof(*dst));
10354
10355 lck_mtx_unlock(&ifp->if_cached_route_lock);
10356 }
10357
10358 static void
ifp_src_route6_copyin(struct ifnet * ifp,struct route_in6 * src)10359 ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
10360 {
10361 lck_mtx_lock_spin(&ifp->if_cached_route_lock);
10362 lck_mtx_convert_spin(&ifp->if_cached_route_lock);
10363
10364 if (ifp->if_fwd_cacheok) {
10365 route_copyin((struct route *)src,
10366 (struct route *)&ifp->if_src_route6, sizeof(*src));
10367 } else {
10368 ROUTE_RELEASE(src);
10369 }
10370 lck_mtx_unlock(&ifp->if_cached_route_lock);
10371 }
10372
/*
 * Look up an IPv4 route for src_ip, preferring the per-ifnet cached
 * entry.  On a cache miss (unusable route or address mismatch) a fresh
 * scoped lookup is performed and cached.  The rtentry returned to the
 * caller carries its own reference (may be NULL on lookup failure).
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route src_rt;
	struct sockaddr_in *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		/* (re)initialize the destination sockaddr if needed */
		if (dst->sin_family != AF_INET) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		/* ROUTE_RELEASE above must have cleared the cached rt */
		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10407
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): look up a route
 * for *src_ip6, preferring the per-ifnet cached entry and refreshing
 * the cache on a miss.  The returned rtentry carries its own reference
 * (may be NULL on lookup failure).
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		/* (re)initialize the destination sockaddr if needed */
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/*
		 * NOTE(review): ROUTE_RELEASE above leaves ro_rt NULL, so
		 * this guard is always true here; the IPv4 path expresses
		 * the same invariant as VERIFY(ro_rt == NULL) instead.
		 */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10444
/*
 * Update the interface's link-quality metric, normalizing the raw
 * value to a threshold edge, and post KEV_DL_LINK_QUALITY_METRIC_CHANGED
 * if the state changed.  'locked' indicates the caller already holds
 * the ifnet lock exclusively; in that case the lock is dropped around
 * the event post and reacquired before returning, so the caller's lock
 * state is preserved either way.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* quality dropped to abort level: schedule early TCP cleanup */
		atomic_bitset_32(&tcbinfo.ipi_flags,
		    INPCBINFO_HANDLE_LQM_ABORT);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return; /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10509
/*
 * Update the interface's RRC state and post KEV_DL_RRC_STATE_CHANGED
 * when it changes.  Caller must hold the ifnet lock exclusively: this
 * function drops it around the event post and reacquires it before
 * returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* no-op if the state is already valid and unchanged */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	/* restore the lock state expected by the caller */
	ifnet_lock_exclusive(ifp);
}
10539
/*
 * Apply the valid fields of *if_interface_state (LQM, RRC state,
 * availability) to the interface.  Returns ENOTSUP if an RRC state is
 * supplied for a non-cellular interface, EINVAL for out-of-range LQM
 * or RRC values, 0 otherwise.  When the interface becomes available,
 * TCP connections on it are prodded to send probes immediately.
 */
errno_t
if_state_update(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	u_short if_index_available = 0;

	ifnet_lock_exclusive(ifp);

	/* RRC state only makes sense for cellular interfaces */
	if ((ifp->if_type != IFT_CELLULAR) &&
	    (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		ifnet_lock_done(ifp);
		return ENOTSUP;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}

	/*
	 * Both helpers below drop and reacquire the ifnet lock around
	 * their kernel-event posts; the lock is held again on return.
	 */
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		ifp->if_interface_state.interface_availability =
		    if_interface_state->interface_availability;

		if (ifp->if_interface_state.interface_availability ==
		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
			    __func__, if_name(ifp), ifp->if_index);
			if_index_available = ifp->if_index;
		} else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
			    __func__, if_name(ifp), ifp->if_index);
		}
	}
	ifnet_lock_done(ifp);

	/*
	 * Check if the TCP connections going on this interface should be
	 * forced to send probe packets instead of waiting for TCP timers
	 * to fire. This is done on an explicit notification such as
	 * SIOCSIFINTERFACESTATE which marks the interface as available.
	 */
	if (if_index_available > 0) {
		tcp_interface_send_probe(if_index_available);
	}

	return 0;
}
10610
10611 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10612 if_get_state(struct ifnet *ifp,
10613 struct if_interface_state *if_interface_state)
10614 {
10615 ifnet_lock_shared(ifp);
10616
10617 if_interface_state->valid_bitmask = 0;
10618
10619 if (ifp->if_interface_state.valid_bitmask &
10620 IF_INTERFACE_STATE_RRC_STATE_VALID) {
10621 if_interface_state->valid_bitmask |=
10622 IF_INTERFACE_STATE_RRC_STATE_VALID;
10623 if_interface_state->rrc_state =
10624 ifp->if_interface_state.rrc_state;
10625 }
10626 if (ifp->if_interface_state.valid_bitmask &
10627 IF_INTERFACE_STATE_LQM_STATE_VALID) {
10628 if_interface_state->valid_bitmask |=
10629 IF_INTERFACE_STATE_LQM_STATE_VALID;
10630 if_interface_state->lqm_state =
10631 ifp->if_interface_state.lqm_state;
10632 }
10633 if (ifp->if_interface_state.valid_bitmask &
10634 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10635 if_interface_state->valid_bitmask |=
10636 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10637 if_interface_state->interface_availability =
10638 ifp->if_interface_state.interface_availability;
10639 }
10640
10641 ifnet_lock_done(ifp);
10642 }
10643
10644 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10645 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10646 {
10647 if (conn_probe > 1) {
10648 return EINVAL;
10649 }
10650 if (conn_probe == 0) {
10651 if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10652 } else {
10653 if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10654 }
10655
10656 #if NECP
10657 necp_update_all_clients();
10658 #endif /* NECP */
10659
10660 tcp_probe_connectivity(ifp, conn_probe);
10661 return 0;
10662 }
10663
10664 /* for uuid.c */
10665 static int
get_ether_index(int * ret_other_index)10666 get_ether_index(int * ret_other_index)
10667 {
10668 struct ifnet *ifp;
10669 int en0_index = 0;
10670 int other_en_index = 0;
10671 int any_ether_index = 0;
10672 short best_unit = 0;
10673
10674 *ret_other_index = 0;
10675 TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
10676 /*
10677 * find en0, or if not en0, the lowest unit en*, and if not
10678 * that, any ethernet
10679 */
10680 ifnet_lock_shared(ifp);
10681 if (strcmp(ifp->if_name, "en") == 0) {
10682 if (ifp->if_unit == 0) {
10683 /* found en0, we're done */
10684 en0_index = ifp->if_index;
10685 ifnet_lock_done(ifp);
10686 break;
10687 }
10688 if (other_en_index == 0 || ifp->if_unit < best_unit) {
10689 other_en_index = ifp->if_index;
10690 best_unit = ifp->if_unit;
10691 }
10692 } else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
10693 any_ether_index = ifp->if_index;
10694 }
10695 ifnet_lock_done(ifp);
10696 }
10697 if (en0_index == 0) {
10698 if (other_en_index != 0) {
10699 *ret_other_index = other_en_index;
10700 } else if (any_ether_index != 0) {
10701 *ret_other_index = any_ether_index;
10702 }
10703 }
10704 return en0_index;
10705 }
10706
/*
 * Fill node[] (ETHER_ADDR_LEN bytes) with an ethernet MAC address
 * suitable for UUID generation.  Returns 0 on success, -1 if no
 * ethernet-style interface exists.  Caches the en0 ifindex in a
 * function-static across calls, revalidating it against
 * ifindex2ifnet[] under the head lock.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* (re)scan if the cached index is unset or no longer valid */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10748
10749 static int
10750 sysctl_rxpoll SYSCTL_HANDLER_ARGS
10751 {
10752 #pragma unused(arg1, arg2)
10753 uint32_t i;
10754 int err;
10755
10756 i = if_rxpoll;
10757
10758 err = sysctl_handle_int(oidp, &i, 0, req);
10759 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10760 return err;
10761 }
10762
10763 if (net_rxpoll == 0) {
10764 return ENXIO;
10765 }
10766
10767 if_rxpoll = i;
10768 return err;
10769 }
10770
10771 static int
10772 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
10773 {
10774 #pragma unused(arg1, arg2)
10775 uint64_t q;
10776 int err;
10777
10778 q = if_rxpoll_mode_holdtime;
10779
10780 err = sysctl_handle_quad(oidp, &q, 0, req);
10781 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10782 return err;
10783 }
10784
10785 if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
10786 q = IF_RXPOLL_MODE_HOLDTIME_MIN;
10787 }
10788
10789 if_rxpoll_mode_holdtime = q;
10790
10791 return err;
10792 }
10793
10794 static int
10795 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
10796 {
10797 #pragma unused(arg1, arg2)
10798 uint64_t q;
10799 int err;
10800
10801 q = if_rxpoll_sample_holdtime;
10802
10803 err = sysctl_handle_quad(oidp, &q, 0, req);
10804 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10805 return err;
10806 }
10807
10808 if (q < IF_RXPOLL_SAMPLETIME_MIN) {
10809 q = IF_RXPOLL_SAMPLETIME_MIN;
10810 }
10811
10812 if_rxpoll_sample_holdtime = q;
10813
10814 return err;
10815 }
10816
10817 static int
10818 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
10819 {
10820 #pragma unused(arg1, arg2)
10821 uint64_t q;
10822 int err;
10823
10824 q = if_rxpoll_interval_time;
10825
10826 err = sysctl_handle_quad(oidp, &q, 0, req);
10827 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10828 return err;
10829 }
10830
10831 if (q < IF_RXPOLL_INTERVALTIME_MIN) {
10832 q = IF_RXPOLL_INTERVALTIME_MIN;
10833 }
10834
10835 if_rxpoll_interval_time = q;
10836
10837 return err;
10838 }
10839
10840 static int
10841 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
10842 {
10843 #pragma unused(arg1, arg2)
10844 uint32_t i;
10845 int err;
10846
10847 i = if_sysctl_rxpoll_wlowat;
10848
10849 err = sysctl_handle_int(oidp, &i, 0, req);
10850 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10851 return err;
10852 }
10853
10854 if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
10855 return EINVAL;
10856 }
10857
10858 if_sysctl_rxpoll_wlowat = i;
10859 return err;
10860 }
10861
10862 static int
10863 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
10864 {
10865 #pragma unused(arg1, arg2)
10866 uint32_t i;
10867 int err;
10868
10869 i = if_sysctl_rxpoll_whiwat;
10870
10871 err = sysctl_handle_int(oidp, &i, 0, req);
10872 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10873 return err;
10874 }
10875
10876 if (i <= if_sysctl_rxpoll_wlowat) {
10877 return EINVAL;
10878 }
10879
10880 if_sysctl_rxpoll_whiwat = i;
10881 return err;
10882 }
10883
10884 static int
10885 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
10886 {
10887 #pragma unused(arg1, arg2)
10888 int i, err;
10889
10890 i = if_sndq_maxlen;
10891
10892 err = sysctl_handle_int(oidp, &i, 0, req);
10893 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10894 return err;
10895 }
10896
10897 if (i < IF_SNDQ_MINLEN) {
10898 i = IF_SNDQ_MINLEN;
10899 }
10900
10901 if_sndq_maxlen = i;
10902 return err;
10903 }
10904
10905 static int
10906 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
10907 {
10908 #pragma unused(arg1, arg2)
10909 int i, err;
10910
10911 i = if_rcvq_maxlen;
10912
10913 err = sysctl_handle_int(oidp, &i, 0, req);
10914 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10915 return err;
10916 }
10917
10918 if (i < IF_RCVQ_MINLEN) {
10919 i = IF_RCVQ_MINLEN;
10920 }
10921
10922 if_rcvq_maxlen = i;
10923 return err;
10924 }
10925
10926 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10927 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10928 int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10929 {
10930 struct kev_dl_node_presence kev;
10931 struct sockaddr_dl *sdl;
10932 struct sockaddr_in6 *sin6;
10933 int ret = 0;
10934
10935 VERIFY(ifp);
10936 VERIFY(sa);
10937 VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10938
10939 bzero(&kev, sizeof(kev));
10940 sin6 = &kev.sin6_node_address;
10941 sdl = &kev.sdl_node_address;
10942 nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10943 kev.rssi = rssi;
10944 kev.link_quality_metric = lqm;
10945 kev.node_proximity_metric = npm;
10946 bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10947
10948 ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10949 if (ret == 0 || ret == EEXIST) {
10950 int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10951 &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10952 if (err != 0) {
10953 log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10954 "error %d\n", __func__, err);
10955 }
10956 }
10957
10958 if (ret == EEXIST) {
10959 ret = 0;
10960 }
10961 return ret;
10962 }
10963
/*
 * Record that a neighbor node has disappeared from the interface and,
 * if the neighbor-cache removal succeeds, post a KEV_DL_NODE_ABSENCE
 * kernel event carrying both address forms of the node.
 *
 * `sa` must be AF_LINK or AF_INET6.  No event is posted when
 * nd6_alt_node_absent() fails (e.g. the node was not known).
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		/* nd6 fills in kev_sdl from its cache on success */
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		/* kev_sdl already populated above; no need to fetch it */
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* stamp the event's link-layer address with our identity */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
11004
/*
 * Variant of dlil_node_present() where the caller supplies the IPv6
 * address (`sa`, AF_INET6) and the link-layer address (`sdl`, AF_LINK)
 * separately instead of having them decomposed from a single sockaddr.
 *
 * Posts a KEV_DL_NODE_PRESENCE event on success.  EEXIST from
 * nd6_alt_node_present() is treated as success; the event is then
 * posted with the "suppress duplicate" flag set.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	/* copy the caller's link-layer address, stamped with our identity */
	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11048
11049 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11050 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11051 kauth_cred_t *credp)
11052 {
11053 const u_int8_t *bytes;
11054 size_t size;
11055
11056 bytes = CONST_LLADDR(sdl);
11057 size = sdl->sdl_alen;
11058
11059 #if CONFIG_MACF
11060 if (dlil_lladdr_ckreq) {
11061 switch (sdl->sdl_type) {
11062 case IFT_ETHER:
11063 case IFT_IEEE1394:
11064 break;
11065 default:
11066 credp = NULL;
11067 break;
11068 }
11069 ;
11070
11071 if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11072 static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11073 [0] = 2
11074 };
11075
11076 bytes = unspec;
11077 }
11078 }
11079 #else
11080 #pragma unused(credp)
11081 #endif
11082
11083 if (sizep != NULL) {
11084 *sizep = size;
11085 }
11086 return bytes;
11087 }
11088
11089 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])11090 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11091 u_int8_t info[DLIL_MODARGLEN])
11092 {
11093 struct kev_dl_issues kev;
11094 struct timeval tv;
11095
11096 VERIFY(ifp != NULL);
11097 VERIFY(modid != NULL);
11098 _CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11099 _CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11100
11101 bzero(&kev, sizeof(kev));
11102
11103 microtime(&tv);
11104 kev.timestamp = tv.tv_sec;
11105 bcopy(modid, &kev.modid, DLIL_MODIDLEN);
11106 if (info != NULL) {
11107 bcopy(info, &kev.info, DLIL_MODARGLEN);
11108 }
11109
11110 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
11111 &kev.link_data, sizeof(kev), FALSE);
11112 }
11113
/*
 * Handle SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC ioctls.
 *
 * Set: requires superuser; maps ifo_flags to a throttle level
 * (IFRIFOF_BLOCK_OPPORTUNISTIC -> opportunistic, 0 -> off, anything
 * else -> EINVAL) and applies it via ifnet_set_throttle().
 * Get: reads the current throttle level and reflects it in ifo_flags.
 *
 * In both directions, on success ifo_inuse is filled with the count of
 * opportunistic TCP+UDP connections over the interface.  EALREADY from
 * the throttle calls is mapped to success.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* translate the ioctl flag into a throttle level */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		/* get: reflect current throttle level back to the caller */
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* a throttle call reporting "already in that state" is success */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11172
/*
 * Query the interface's current output throttle level into *level.
 *
 * Returns ENXIO when the interface does not use the new TX model
 * (IFEF_TXSTART).  When the send classq is not enabled, *level is left
 * at IFNET_THROTTLE_OFF and 0 is returned.
 */
int
ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	*level = IFNET_THROTTLE_OFF;

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* Throttling works only for IFCQ, not ALTQ instances */
	if (IFCQ_IS_ENABLED(ifq)) {
		/* first field 0 == query (see set variant, which passes 1) */
		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
		*level = req.level;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
11198
/*
 * Set the interface's output throttle level (IFNET_THROTTLE_OFF or
 * IFNET_THROTTLE_OPPORTUNISTIC; any other value -> EINVAL).
 *
 * Returns ENXIO when the interface does not use the new TX model
 * (IFEF_TXSTART).  On success, NECP clients are notified, and when
 * throttling is turned off the start thread is kicked so queued
 * packets begin draining again.
 */
int
ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;

	switch (level) {
	case IFNET_THROTTLE_OFF:
	case IFNET_THROTTLE_OPPORTUNISTIC:
		break;
	default:
		return EINVAL;
	}

	IFCQ_LOCK(ifq);
	if (IFCQ_IS_ENABLED(ifq)) {
		/* first field 1 == set request (get variant passes 0) */
		cqrq_throttle_t req = { 1, level };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
	}
	IFCQ_UNLOCK(ifq);

	if (err == 0) {
		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
		    level);
#if NECP
		necp_update_all_clients();
#endif /* NECP */
		/* resume transmission now that throttling is lifted */
		if (level == IFNET_THROTTLE_OFF) {
			ifnet_start(ifp);
		}
	}

	return err;
}
11240
/*
 * Handle SIOCSIFLOG / SIOCGIFLOG ioctls.
 *
 * Set: requires PRIV_NET_INTERFACE_CONTROL; validates the level range
 * and that at least one facility bit within IFNET_LOGF_MASK is set,
 * then applies via ifnet_set_log().  Get: returns the current logging
 * parameters via ifnet_get_log().
 */
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* mask to known facility bits; empty selection is invalid */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11288
/*
 * Apply a logging level/facility-flag configuration to the interface.
 *
 * The requested flags are merged with the interface's existing flags
 * (the level applies to all facilities).  Non-DLIL facility bits are
 * forwarded to the driver via if_output_ctl when registered; without a
 * callback, such bits are silently dropped and only the DLIL facility
 * is honored.  Setting level IFNET_LOG_DEFAULT clears all flags.
 *
 * Caller must have validated level/flags (see ifnet_getset_log()).
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* DLIL facility is handled here, not by the driver */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		/* nothing left to log for and DLIL wasn't on: reset level */
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* assignment intentional: store level, then test for default */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11347
11348 int
ifnet_get_log(struct ifnet * ifp,int32_t * level,uint32_t * flags,int32_t * category,int32_t * subcategory)11349 ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11350 int32_t *category, int32_t *subcategory)
11351 {
11352 if (level != NULL) {
11353 *level = ifp->if_log.level;
11354 }
11355 if (flags != NULL) {
11356 *flags = ifp->if_log.flags;
11357 }
11358 if (category != NULL) {
11359 *category = ifp->if_log.category;
11360 }
11361 if (subcategory != NULL) {
11362 *subcategory = ifp->if_log.subcategory;
11363 }
11364
11365 return 0;
11366 }
11367
11368 int
ifnet_notify_address(struct ifnet * ifp,int af)11369 ifnet_notify_address(struct ifnet *ifp, int af)
11370 {
11371 struct ifnet_notify_address_params na;
11372
11373 #if PF
11374 (void) pf_ifaddr_hook(ifp);
11375 #endif /* PF */
11376
11377 if (ifp->if_output_ctl == NULL) {
11378 return EOPNOTSUPP;
11379 }
11380
11381 bzero(&na, sizeof(na));
11382 na.address_family = (sa_family_t)af;
11383
11384 return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11385 sizeof(na), &na);
11386 }
11387
11388 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11389 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11390 {
11391 if (ifp == NULL || flowid == NULL) {
11392 return EINVAL;
11393 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11394 !IF_FULLY_ATTACHED(ifp)) {
11395 return ENXIO;
11396 }
11397
11398 *flowid = ifp->if_flowhash;
11399
11400 return 0;
11401 }
11402
11403 errno_t
ifnet_disable_output(struct ifnet * ifp)11404 ifnet_disable_output(struct ifnet *ifp)
11405 {
11406 int err;
11407
11408 if (ifp == NULL) {
11409 return EINVAL;
11410 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11411 !IF_FULLY_ATTACHED(ifp)) {
11412 return ENXIO;
11413 }
11414
11415 if ((err = ifnet_fc_add(ifp)) == 0) {
11416 lck_mtx_lock_spin(&ifp->if_start_lock);
11417 ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11418 lck_mtx_unlock(&ifp->if_start_lock);
11419 }
11420 return err;
11421 }
11422
11423 errno_t
ifnet_enable_output(struct ifnet * ifp)11424 ifnet_enable_output(struct ifnet *ifp)
11425 {
11426 if (ifp == NULL) {
11427 return EINVAL;
11428 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11429 !IF_FULLY_ATTACHED(ifp)) {
11430 return ENXIO;
11431 }
11432
11433 ifnet_start_common(ifp, TRUE);
11434 return 0;
11435 }
11436
/*
 * Flow-advisory callback: a driver signals that the flow identified by
 * `flowhash` may transmit again.  Looks up (and removes) the matching
 * flow-control entry and re-enables output on its interface.
 *
 * The entry returned by ifnet_fc_get() is already detached from the
 * tree; it is freed here in all cases.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		/* unknown or stale flow hash; nothing to do */
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* drop the io refcnt taken by ifnet_is_attached(, 1) */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11460
11461 /*
11462 * Function to compare ifnet_fc_entries in ifnet flow control tree
11463 */
11464 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11465 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11466 {
11467 return fc1->ifce_flowhash - fc2->ifce_flowhash;
11468 }
11469
/*
 * Insert a flow-control entry for the interface's current flow hash
 * into ifnet_fc_tree.
 *
 * Returns 0 if inserted or already present for this ifp; EAGAIN when a
 * different interface already occupies the same hash (rare collision,
 * deliberately not chained).  The lock is taken as a spin lock for the
 * fast lookup path and converted to a full mutex only when the
 * (possibly blocking) zone allocation is needed.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* search key: only the hash field matters to ifce_cmp() */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer. There can be a collision
		 * on flow hash but the probability is low. Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	/* Z_WAITOK may block; safe now that the lock is a full mutex */
	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11513
/*
 * Look up and REMOVE the flow-control entry for `flowhash` from
 * ifnet_fc_tree.
 *
 * Returns the detached entry (ownership transfers to the caller, who
 * must free it with ifnet_fc_entry_free()), or NULL if no entry exists
 * or its interface is no longer attached (in which case the entry is
 * freed here).
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* search key: only the hash field matters to ifce_cmp() */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* detach before dropping the lock so no one else can see it */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11551
/*
 * Release a flow-control entry back to its zone.  Callers free entries
 * only after they have been removed from ifnet_fc_tree (see
 * ifnet_fc_get()).
 */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11557
11558 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11559 ifnet_calc_flowhash(struct ifnet *ifp)
11560 {
11561 struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11562 uint32_t flowhash = 0;
11563
11564 if (ifnet_flowhash_seed == 0) {
11565 ifnet_flowhash_seed = RandomULong();
11566 }
11567
11568 bzero(&fh, sizeof(fh));
11569
11570 (void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11571 fh.ifk_unit = ifp->if_unit;
11572 fh.ifk_flags = ifp->if_flags;
11573 fh.ifk_eflags = ifp->if_eflags;
11574 fh.ifk_capabilities = ifp->if_capabilities;
11575 fh.ifk_capenable = ifp->if_capenable;
11576 fh.ifk_output_sched_model = ifp->if_output_sched_model;
11577 fh.ifk_rand1 = RandomULong();
11578 fh.ifk_rand2 = RandomULong();
11579
11580 try_again:
11581 flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11582 if (flowhash == 0) {
11583 /* try to get a non-zero flowhash */
11584 ifnet_flowhash_seed = RandomULong();
11585 goto try_again;
11586 }
11587
11588 return flowhash;
11589 }
11590
/*
 * Store (or clear, when len == 0) the per-address-family network
 * signature for the interface.
 *
 * Supported families: AF_INET and AF_INET6 (anything else -> EINVAL).
 * EINVAL if len exceeds the signature buffer; ENOMEM when the
 * per-family extra data for the interface was never allocated.
 * `flags` is currently unused.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			/* per-family extension area was never allocated */
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		/* mirror of the AF_INET case against the inet6 data */
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11652
/*
 * Copy the interface's network signature for the given family into
 * `data`.  On input *len is the caller's buffer size; on success it is
 * updated to the actual signature length.
 *
 * Errors: EINVAL on NULL args, unsupported family, or a buffer smaller
 * than the stored signature; ENOENT when no signature is set; ENOMEM
 * when the per-family extra data was never allocated.  On success,
 * *flags (if non-NULL) is cleared — no flag bits are defined yet.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			/* assignment intentional: report actual length */
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		/* mirror of the AF_INET case against the inet6 data */
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11713
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on the
 * interface.  A slot with prefix_len == 0 clears that slot; otherwise
 * the length must be one of the RFC 6052 lengths (32/40/48/56/64/96)
 * and the prefix must not have embedded interface/link-local scope.
 *
 * Returns EINVAL on a bad length or scoped prefix (earlier slots may
 * already have been updated at that point), ENOMEM when the inet6
 * extra data was never allocated.  NECP clients are notified when at
 * least one prefix was set successfully.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* tell NECP clients only after the lock is dropped */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11779
/*
 * Copy the interface's NAT64 prefix table into `prefixes` (which must
 * hold NAT64_MAX_NUM_PREFIXES entries), if any prefix is configured.
 *
 * `prefixes` may be NULL to merely probe for existence.  Returns
 * EINVAL on NULL ifp, ENOENT when no prefix is set, ENOMEM when the
 * inet6 extra data was never allocated.
 */
int
ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, found_one = 0, error = 0;

	if (ifp == NULL) {
		return EINVAL;
	}

	if_inet6data_lock_shared(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	/* a slot is "in use" iff its prefix_len is non-zero */
	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
			found_one = 1;
		}
	}

	if (found_one == 0) {
		error = ENOENT;
		goto out;
	}

	if (prefixes) {
		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
	}

out:
	if_inet6data_lock_done(ifp);

	return error;
}
11817
/*
 * Hardware-checksum debugging hook for the output path: when
 * HWCKSUM_DBG_FINALIZE_FORCED is enabled, force software finalization
 * of any delayed IP/transport checksums on the outgoing packet and
 * count what was finalized.  TSO packets are skipped, since their
 * checksums are produced by the hardware during segmentation.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		/* only IPv4/IPv6 checksums are subject to this debug mode */
		return;
	}
}
11859
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/*
	 * Debug aid for inbound packets: optionally force partial-checksum
	 * metadata onto the packet, and/or verify (and re-base) partial
	 * checksum values reported by the driver/hardware.
	 *
	 * First validate that the caller-supplied frame header pointer lies
	 * within the mbuf's data storage and not past the payload pointer.
	 */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* link-layer header length: bytes between frame header and payload */
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		/* checksum emulation applies to IP traffic only */
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* discard any RX checksum metadata set by the driver */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* rx_start is expressed relative to the frame header */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* re-base the already-verified sum to the new offset */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11984
11985 static int
11986 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
11987 {
11988 #pragma unused(arg1, arg2)
11989 u_int32_t i;
11990 int err;
11991
11992 i = hwcksum_dbg_mode;
11993
11994 err = sysctl_handle_int(oidp, &i, 0, req);
11995 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11996 return err;
11997 }
11998
11999 if (hwcksum_dbg == 0) {
12000 return ENODEV;
12001 }
12002
12003 if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12004 return EINVAL;
12005 }
12006
12007 hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12008
12009 return err;
12010 }
12011
12012 static int
12013 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12014 {
12015 #pragma unused(arg1, arg2)
12016 u_int32_t i;
12017 int err;
12018
12019 i = hwcksum_dbg_partial_rxoff_forced;
12020
12021 err = sysctl_handle_int(oidp, &i, 0, req);
12022 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12023 return err;
12024 }
12025
12026 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12027 return ENODEV;
12028 }
12029
12030 hwcksum_dbg_partial_rxoff_forced = i;
12031
12032 return err;
12033 }
12034
12035 static int
12036 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12037 {
12038 #pragma unused(arg1, arg2)
12039 u_int32_t i;
12040 int err;
12041
12042 i = hwcksum_dbg_partial_rxoff_adj;
12043
12044 err = sysctl_handle_int(oidp, &i, 0, req);
12045 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12046 return err;
12047 }
12048
12049 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12050 return ENODEV;
12051 }
12052
12053 hwcksum_dbg_partial_rxoff_adj = i;
12054
12055 return err;
12056 }
12057
12058 static int
12059 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12060 {
12061 #pragma unused(oidp, arg1, arg2)
12062 int err;
12063
12064 if (req->oldptr == USER_ADDR_NULL) {
12065 }
12066 if (req->newptr != USER_ADDR_NULL) {
12067 return EPERM;
12068 }
12069 err = SYSCTL_OUT(req, &tx_chain_len_stats,
12070 sizeof(struct chain_len_stats));
12071
12072 return err;
12073 }
12074
12075
#if DEBUG || DEVELOPMENT
/* Blob for sum16 verification; arbitrary bytes (gzip-like data) used as
 * the input for the checksum self-tests in dlil_verify_sum16() below. */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};

/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t init;         /* TRUE once sumr has been seeded at runtime */
	uint16_t len;           /* span length (bytes) over sumdata */
	uint16_t sumr;          /* reference */
	uint16_t sumrp;         /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl[] */
#define SUMTBL_MAX ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12138
12139 static void
dlil_verify_sum16(void)12140 dlil_verify_sum16(void)
12141 {
12142 struct mbuf *m;
12143 uint8_t *buf;
12144 int n;
12145
12146 /* Make sure test data plus extra room for alignment fits in cluster */
12147 _CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);
12148
12149 kprintf("DLIL: running SUM16 self-tests ... ");
12150
12151 m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
12152 m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));
12153
12154 buf = mtod(m, uint8_t *); /* base address */
12155
12156 for (n = 0; n < SUMTBL_MAX; n++) {
12157 uint16_t len = sumtbl[n].len;
12158 int i;
12159
12160 /* Verify for all possible alignments */
12161 for (i = 0; i < (int)sizeof(uint64_t); i++) {
12162 uint16_t sum, sumr;
12163 uint8_t *c;
12164
12165 /* Copy over test data to mbuf */
12166 VERIFY(len <= sizeof(sumdata));
12167 c = buf + i;
12168 bcopy(sumdata, c, len);
12169
12170 /* Zero-offset test (align by data pointer) */
12171 m->m_data = (caddr_t)c;
12172 m->m_len = len;
12173 sum = m_sum16(m, 0, len);
12174
12175 if (!sumtbl[n].init) {
12176 sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
12177 sumtbl[n].sumr = sumr;
12178 sumtbl[n].init = TRUE;
12179 } else {
12180 sumr = sumtbl[n].sumr;
12181 }
12182
12183 /* Something is horribly broken; stop now */
12184 if (sumr != sumtbl[n].sumrp) {
12185 panic_plain("\n%s: broken in_cksum_mbuf_ref() "
12186 "for len=%d align=%d sum=0x%04x "
12187 "[expected=0x%04x]\n", __func__,
12188 len, i, sum, sumr);
12189 /* NOTREACHED */
12190 } else if (sum != sumr) {
12191 panic_plain("\n%s: broken m_sum16() for len=%d "
12192 "align=%d sum=0x%04x [expected=0x%04x]\n",
12193 __func__, len, i, sum, sumr);
12194 /* NOTREACHED */
12195 }
12196
12197 /* Alignment test by offset (fixed data pointer) */
12198 m->m_data = (caddr_t)buf;
12199 m->m_len = i + len;
12200 sum = m_sum16(m, i, len);
12201
12202 /* Something is horribly broken; stop now */
12203 if (sum != sumr) {
12204 panic_plain("\n%s: broken m_sum16() for len=%d "
12205 "offset=%d sum=0x%04x [expected=0x%04x]\n",
12206 __func__, len, i, sum, sumr);
12207 /* NOTREACHED */
12208 }
12209 #if INET
12210 /* Simple sum16 contiguous buffer test by aligment */
12211 sum = b_sum16(c, len);
12212
12213 /* Something is horribly broken; stop now */
12214 if (sum != sumr) {
12215 panic_plain("\n%s: broken b_sum16() for len=%d "
12216 "align=%d sum=0x%04x [expected=0x%04x]\n",
12217 __func__, len, i, sum, sumr);
12218 /* NOTREACHED */
12219 }
12220 #endif /* INET */
12221 }
12222 }
12223 m_freem(m);
12224
12225 kprintf("PASSED\n");
12226 }
12227 #endif /* DEBUG || DEVELOPMENT */
12228
/* Expands to a switch case that returns the stringified constant name */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* data-link kernel event code to its symbolic name for
 * logging/diagnostics.  Returns the empty string for any code not
 * listed below.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
		CASE_STRINGIFY(KEV_DL_SIFFLAGS);
		CASE_STRINGIFY(KEV_DL_SIFMETRICS);
		CASE_STRINGIFY(KEV_DL_SIFMTU);
		CASE_STRINGIFY(KEV_DL_SIFPHYS);
		CASE_STRINGIFY(KEV_DL_SIFMEDIA);
		CASE_STRINGIFY(KEV_DL_SIFGENERIC);
		CASE_STRINGIFY(KEV_DL_ADDMULTI);
		CASE_STRINGIFY(KEV_DL_DELMULTI);
		CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
		CASE_STRINGIFY(KEV_DL_IF_DETACHING);
		CASE_STRINGIFY(KEV_DL_IF_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_OFF);
		CASE_STRINGIFY(KEV_DL_LINK_ON);
		CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
		CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
		CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
		CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
		CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
		CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
		CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
		CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
		CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
		CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
		CASE_STRINGIFY(KEV_DL_ISSUES);
		CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
12265
12266 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12267 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12268 {
12269 #pragma unused(arg1)
12270 struct ifnet *ifp = arg0;
12271
12272 if (ifnet_is_attached(ifp, 1)) {
12273 nstat_ifnet_threshold_reached(ifp->if_index);
12274 ifnet_decr_iorefcnt(ifp);
12275 }
12276 }
12277
/*
 * Check whether the interface's combined RX+TX byte count has advanced
 * past its data threshold since the last notification, and if so schedule
 * the dlil_dt_tcall_fn thread call to notify NetworkStatistics.  The
 * notification is rate-limited by threshold_interval.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 */
	/*
	 * The CAS ensures only one caller records the new byte count and
	 * arms the thread call; losers of the race (or callers racing with
	 * an already-pending call) do nothing.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* defer to the next periodic interval boundary */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* no interval configured: notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12307
#if (DEVELOPMENT || DEBUG)
/*
 * The sysctl variable name contains the input parameters of
 * ifnet_get_keepalive_offload_frames()
 *      ifp (interface index): name[0]
 *      frames_array_count: name[1]
 *      frame_data_offset: name[2]
 * The return length gives used_frames_count
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root can get look at other people TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		/* read-only sysctl */
		error = EPERM;
		goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	/*
	 * NOTE(review): frames_array_count comes straight from userland;
	 * it bounds both the kalloc_data() size below and the user buffer
	 * check -- presumably acceptable for a root-only DEVELOPMENT/DEBUG
	 * sysctl, but confirm before widening availability.
	 */
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/* Make sure the passed buffer is large enough */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	/* resolve the interface index while holding the head lock */
	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	ifnet_head_done();

	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* copy each populated frame out to the caller */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
#endif /* DEVELOPMENT || DEBUG */
12409
/* Forward per-flow interface statistics to the TCP layer for accounting. */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12416
/* Atomically OR set_flags into *flags_p; returns the previous value. */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12422
/* Atomically clear clear_flags from *flags_p. */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12428
/* Atomically set extended-flag bits; returns the previous if_eflags. */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12434
/* Atomically clear extended-flag bits on the interface. */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12440
/* Atomically set x-flag bits; returns the previous if_xflags. */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12446
/* Atomically clear x-flag bits on the interface. */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12452
/* Bump the interface's traffic-rule generation ID to invalidate caches. */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	atomic_add_32(&ifp->if_traffic_rule_genid, 1);
}
12458
12459 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12460 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12461 {
12462 if (*genid != ifp->if_traffic_rule_genid) {
12463 *genid = ifp->if_traffic_rule_genid;
12464 return TRUE;
12465 }
12466 return FALSE;
12467 }
/* Record the new traffic-rule count and bump the generation ID. */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	atomic_set_32(&ifp->if_traffic_rule_count, count);
	ifnet_update_traffic_rule_genid(ifp);
}
12474
12475 static void
log_hexdump(void * data,size_t len)12476 log_hexdump(void *data, size_t len)
12477 {
12478 size_t i, j, k;
12479 unsigned char *ptr = (unsigned char *)data;
12480 #define MAX_DUMP_BUF 32
12481 unsigned char buf[3 * MAX_DUMP_BUF + 1];
12482
12483 for (i = 0; i < len; i += MAX_DUMP_BUF) {
12484 for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12485 unsigned char msnbl = ptr[j] >> 4;
12486 unsigned char lsnbl = ptr[j] & 0x0f;
12487
12488 buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12489 buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12490
12491 if ((j % 2) == 1) {
12492 buf[k++] = ' ';
12493 }
12494 if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12495 buf[k++] = ' ';
12496 }
12497 }
12498 buf[k] = 0;
12499 os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12500 }
12501 }
12502
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
/*
 * Report whether interface filters are Skywalk-compatible.  With a NULL
 * ifp the check is global: every attached filter must be an OS filter.
 * With a specific interface, it must have no non-OS filters attached.
 */
static bool
net_check_compatible_if_filter(struct ifnet *ifp)
{
	if (ifp != NULL) {
		return ifp->if_flt_non_os_count == 0;
	}
	return net_api_stats.nas_iflt_attach_count <=
	       net_api_stats.nas_iflt_attach_os_count;
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
12519
12520 #define DUMP_BUF_CHK() { \
12521 clen -= k; \
12522 if (clen < 1) \
12523 goto done; \
12524 c += k; \
12525 }
12526
12527 int dlil_dump_top_if_qlen(char *, int);
12528 int
dlil_dump_top_if_qlen(char * str,int str_len)12529 dlil_dump_top_if_qlen(char *str, int str_len)
12530 {
12531 char *c = str;
12532 int k, clen = str_len;
12533 struct ifnet *top_ifcq_ifp = NULL;
12534 uint32_t top_ifcq_len = 0;
12535 struct ifnet *top_inq_ifp = NULL;
12536 uint32_t top_inq_len = 0;
12537
12538 for (int ifidx = 1; ifidx < if_index; ifidx++) {
12539 struct ifnet *ifp = ifindex2ifnet[ifidx];
12540 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
12541
12542 if (ifp == NULL) {
12543 continue;
12544 }
12545 if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
12546 top_ifcq_len = ifp->if_snd->ifcq_len;
12547 top_ifcq_ifp = ifp;
12548 }
12549 if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
12550 top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
12551 top_inq_ifp = ifp;
12552 }
12553 }
12554
12555 if (top_ifcq_ifp != NULL) {
12556 k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
12557 top_ifcq_len, top_ifcq_ifp->if_xname);
12558 DUMP_BUF_CHK();
12559 }
12560 if (top_inq_ifp != NULL) {
12561 k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
12562 top_inq_len, top_inq_ifp->if_xname);
12563 DUMP_BUF_CHK();
12564 }
12565 done:
12566 return str_len - clen;
12567 }
12568
#if DEVELOPMENT || DEBUG
/*
 * Write-only sysctl: install a new packet-trace flow key.  Accepts either
 * a fully-specified UDP 4-tuple (IPv4 or IPv6, non-zero ports and
 * non-wildcard addresses) or an all-zero key (fk_ipver == 0), which is
 * presumably used to clear the filter -- confirm against the consumer of
 * flow_key_trace.
 */
__private_extern__ int
packet_dump_trace_update(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct flow_key key = {};
	int error = 0;

	/* a new value must be supplied; reads are meaningless here */
	if (req->newptr == USER_ADDR_NULL) {
		return EINVAL;
	}
	if (req->newlen < sizeof(struct flow_key)) {
		return EINVAL;
	}
	error = SYSCTL_IN(req, &key, sizeof(struct flow_key));
	if (error != 0) {
		return error;
	}

	switch (key.fk_ipver) {
	case IPVERSION:
		/* IPv4: require UDP with both ports and both addresses set */
		if (key.fk_proto != IPPROTO_UDP ||
		    key.fk_sport == 0 || key.fk_dport == 0) {
			return EINVAL;
		}

		if (key.fk_src4.s_addr == INADDR_ANY ||
		    key.fk_dst4.s_addr == INADDR_ANY) {
			return EINVAL;
		}

		break;
	case IPV6_VERSION:
		/* IPv6: same requirements as the IPv4 case */
		if (key.fk_proto != IPPROTO_UDP ||
		    key.fk_sport == 0 || key.fk_dport == 0) {
			return EINVAL;
		}

		if (IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
			return EINVAL;
		}

		break;
	case 0:
		/* version 0: only an entirely-zero key is accepted */
		if (key.fk_proto != 0 ||
		    key.fk_sport != 0 || key.fk_dport != 0) {
			return EINVAL;
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
		    !IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
			return EINVAL;
		}

		break;
	default:
		return EINVAL;
	}

	memcpy(&flow_key_trace, &key, sizeof(struct flow_key));
	return 0;
}
#endif /* DEVELOPMENT || DEBUG */
12632