1 /*
2 * Copyright (c) 1999-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections. This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34 #include <stddef.h>
35 #include <ptrauth.h>
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/domain.h>
44 #include <sys/user.h>
45 #include <sys/random.h>
46 #include <sys/socketvar.h>
47 #include <net/if_dl.h>
48 #include <net/if.h>
49 #include <net/route.h>
50 #include <net/if_var.h>
51 #include <net/dlil.h>
52 #include <net/if_arp.h>
53 #include <net/iptap.h>
54 #include <net/pktap.h>
55 #include <net/nwk_wq.h>
56 #include <sys/kern_event.h>
57 #include <sys/kdebug.h>
58 #include <sys/mcache.h>
59 #include <sys/syslog.h>
60 #include <sys/protosw.h>
61 #include <sys/priv.h>
62
63 #include <kern/assert.h>
64 #include <kern/task.h>
65 #include <kern/thread.h>
66 #include <kern/sched_prim.h>
67 #include <kern/locks.h>
68 #include <kern/zalloc.h>
69
70 #include <net/kpi_protocol.h>
71 #include <net/if_types.h>
72 #include <net/if_ipsec.h>
73 #include <net/if_llreach.h>
74 #include <net/if_utun.h>
75 #include <net/kpi_interfacefilter.h>
76 #include <net/classq/classq.h>
77 #include <net/classq/classq_sfb.h>
78 #include <net/flowhash.h>
79 #include <net/ntstat.h>
80 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
81 #include <skywalk/lib/net_filter_event.h>
82 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
83 #include <net/if_llatbl.h>
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143
144 #include <os/log.h>
145
146 #define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0)
147 #define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2)
148 #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
149 #define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
150 #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
151
152 #define IFNET_KTRACE_TX_PKT_DUMP IFNETDBG_CODE(DBG_IFNET, 0x001)
153 #define IFNET_KTRACE_RX_PKT_DUMP IFNETDBG_CODE(DBG_IFNET, 0x002)
154
155 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
156 #define MAX_LINKADDR 4 /* LONGWORDS */
157
158
159 #if 1
160 #define DLIL_PRINTF printf
161 #else
162 #define DLIL_PRINTF kprintf
163 #endif
164
165 #define IF_DATA_REQUIRE_ALIGNED_64(f) \
166 _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
167
168 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \
169 _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
170
/*
 * Protocol KPI versions; stored in if_proto.proto_kpi to select which
 * member of the if_proto kpi union is valid.
 */
enum {
	kProtoKPI_v1 = 1,
	kProtoKPI_v2 = 2
};
175
176 /*
177 * List of if_proto structures in if_proto_hash[] is protected by
178 * the ifnet lock. The rest of the fields are initialized at protocol
179 * attach time and never change, thus no lock required as long as
180 * a reference to it is valid, via if_proto_ref().
181 */
struct if_proto {
	SLIST_ENTRY(if_proto) next_hash;	/* if_proto_hash[] chain linkage */
	u_int32_t refcount;			/* reference count; see if_proto_ref()/if_proto_free() */
	u_int32_t detached;			/* non-zero once detached from the interface */
	struct ifnet *ifp;			/* interface this protocol is attached to */
	protocol_family_t protocol_family;	/* attached protocol family */
	int proto_kpi;				/* kProtoKPI_v1 or kProtoKPI_v2; selects union member */
	union {
		/*
		 * v1 KPI: the input callback additionally receives a
		 * frame header pointer (see ifproto_media_input_v1()).
		 */
		struct {
			proto_media_input input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v1;
		/*
		 * v2 KPI: the input callback receives the mbuf only
		 * (see ifproto_media_input_v2()).
		 */
		struct {
			proto_media_input_v2 input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v2;
	} kpi;
};
210
211 SLIST_HEAD(proto_hash_entry, if_proto);
212
213 #define DLIL_SDLDATALEN \
214 (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
215
/*
 * DLIL-private container around the public ifnet.  dl_if is the first
 * member so that DLIL_TO_IFP()/IFP_TO_DLIL() can convert between the
 * two representations by simple pointer casts.
 */
struct dlil_ifnet {
	struct ifnet dl_if; /* public ifnet; MUST remain first (see IFP_TO_DLIL) */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */
	u_int32_t dl_if_flags; /* flags (below: DLIF_INUSE, DLIF_REUSE, DLIF_DEBUG) */
	u_int32_t dl_if_refcnt; /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void *dl_if_uniqueid; /* unique interface id */
	size_t dl_if_uniqueid_len; /* length of the unique id */
	char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */
	char dl_if_xnamestorage[IFXNAMSIZ]; /* external name storage */
	struct {
		struct ifaddr ifa; /* lladdr ifa */
		u_int8_t asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr; /* link-level address and backing storage */
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set; /* non-zero once the above is valid */
	u_int8_t dl_if_unused; /* padding */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t dl_if_attach; /* attach PC stacktrace */
	ctrace_t dl_if_detach; /* detach PC stacktrace */
};
243
244 /* Values for dl_if_flags (private to DLIL) */
245 #define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */
246 #define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */
247 #define DLIF_DEBUG 0x4 /* has debugging info */
248
249 #define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */
250
251 /* For gdb */
252 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
253
/*
 * Debug-instrumented variant of dlil_ifnet (used when DLIF_DEBUG is
 * set); adds reference-count bookkeeping and circular stack-trace
 * history of the last IF_REF_TRACE_HIST_SIZE hold/release callers.
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet dldbg_dlif; /* dlil_ifnet; must remain first */
	u_int16_t dldbg_if_refhold_cnt; /* # ifnet references */
	u_int16_t dldbg_if_refrele_cnt; /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
264
265 #define DLIL_TO_IFP(s) (&s->dl_if)
266 #define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s)
267
/*
 * Interface filter instance; entries are kept on a per-interface list
 * via filt_next and walked by dlil_interface_filters_input/_output().
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter) filt_next;	/* filter list linkage */
	u_int32_t filt_skip;			/* non-zero to bypass this filter (NOTE: inferred from name; verify in dlil_interface_filters_*) */
	u_int32_t filt_flags;			/* filter flags */
	ifnet_t filt_ifp;			/* interface the filter is attached to */
	const char *filt_name;			/* name supplied at attach time */
	void *filt_cookie;			/* opaque client context passed to callbacks */
	protocol_family_t filt_protocol;	/* protocol family the filter applies to */
	iff_input_func filt_input;		/* inbound packet callback */
	iff_output_func filt_output;		/* outbound packet callback */
	iff_event_func filt_event;		/* interface event callback */
	iff_ioctl_func filt_ioctl;		/* ioctl callback */
	iff_detached_func filt_detached;	/* called when filter detach completes */
};
282
283 struct proto_input_entry;
284
285 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
286
287 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
288
289 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
290 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
291 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
292 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
293 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
294
295 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
296 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
297 &dlil_lck_attributes);
298 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
299 &dlil_lck_attributes);
300
301 #if DEBUG
302 static unsigned int ifnet_debug = 1; /* debugging (enabled) */
303 #else
304 static unsigned int ifnet_debug; /* debugging (disabled) */
305 #endif /* !DEBUG */
306 static unsigned int dlif_size; /* size of dlil_ifnet to allocate */
307 static unsigned int dlif_bufsize; /* size of dlif_size + headroom */
308 static struct zone *dlif_zone; /* zone for dlil_ifnet */
309 #define DLIF_ZONE_NAME "ifnet" /* zone name */
310
311 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
312
313 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
314
315 static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */
316 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
317 static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */
318 #define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */
319
320 static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */
321 static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */
322 static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */
323 #define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */
324
325 static u_int32_t net_rtref;
326
327 static struct dlil_main_threading_info dlil_main_input_thread_info;
328 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
329 (struct dlil_threading_info *)&dlil_main_input_thread_info;
330
331 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
332 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
333 static void dlil_if_trace(struct dlil_ifnet *, int);
334 static void if_proto_ref(struct if_proto *);
335 static void if_proto_free(struct if_proto *);
336 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
337 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
338 u_int32_t list_count);
339 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
340 static void if_flt_monitor_busy(struct ifnet *);
341 static void if_flt_monitor_unbusy(struct ifnet *);
342 static void if_flt_monitor_enter(struct ifnet *);
343 static void if_flt_monitor_leave(struct ifnet *);
344 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
345 char **, protocol_family_t);
346 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
347 protocol_family_t);
348 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
349 const struct sockaddr_dl *);
350 static int ifnet_lookup(struct ifnet *);
351 static void if_purgeaddrs(struct ifnet *);
352
353 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
354 struct mbuf *, char *);
355 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
356 struct mbuf *);
357 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
358 mbuf_t *, const struct sockaddr *, void *, char *, char *);
359 static void ifproto_media_event(struct ifnet *, protocol_family_t,
360 const struct kev_msg *);
361 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
362 unsigned long, void *);
363 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
364 struct sockaddr_dl *, size_t);
365 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
366 const struct sockaddr_dl *, const struct sockaddr *,
367 const struct sockaddr_dl *, const struct sockaddr *);
368
369 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
370 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
371 boolean_t poll, struct thread *tp);
372 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
373 struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
374 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
375 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
376 protocol_family_t *);
377 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
378 const struct ifnet_demux_desc *, u_int32_t);
379 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
380 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
381 #if !XNU_TARGET_OS_OSX
382 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
383 const struct sockaddr *, const char *, const char *,
384 u_int32_t *, u_int32_t *);
385 #else /* XNU_TARGET_OS_OSX */
386 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
387 const struct sockaddr *, const char *, const char *);
388 #endif /* XNU_TARGET_OS_OSX */
389 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
390 const struct sockaddr *, const char *, const char *,
391 u_int32_t *, u_int32_t *);
392 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
393 static void ifp_if_free(struct ifnet *);
394 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
395 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
396 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
397
398 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
399 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
400 boolean_t, struct thread *);
401 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
402 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
403 boolean_t, struct thread *);
404
405 static void dlil_main_input_thread_func(void *, wait_result_t);
406 static void dlil_main_input_thread_cont(void *, wait_result_t);
407
408 static void dlil_input_thread_func(void *, wait_result_t);
409 static void dlil_input_thread_cont(void *, wait_result_t);
410
411 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
412 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
413
414 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
415 thread_continue_t *);
416 static void dlil_terminate_input_thread(struct dlil_threading_info *);
417 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
418 struct dlil_threading_info *, struct ifnet *, boolean_t);
419 static boolean_t dlil_input_stats_sync(struct ifnet *,
420 struct dlil_threading_info *);
421 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
422 u_int32_t, ifnet_model_t, boolean_t);
423 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
424 const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
425 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
426 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
427 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
428 #if DEBUG || DEVELOPMENT
429 static void dlil_verify_sum16(void);
430 #endif /* DEBUG || DEVELOPMENT */
431 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
432 protocol_family_t);
433 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
434 protocol_family_t);
435
436 static void dlil_incr_pending_thread_count(void);
437 static void dlil_decr_pending_thread_count(void);
438
439 static void ifnet_detacher_thread_func(void *, wait_result_t);
440 static void ifnet_detacher_thread_cont(void *, wait_result_t);
441 static void ifnet_detach_final(struct ifnet *);
442 static void ifnet_detaching_enqueue(struct ifnet *);
443 static struct ifnet *ifnet_detaching_dequeue(void);
444
445 static void ifnet_start_thread_func(void *, wait_result_t);
446 static void ifnet_start_thread_cont(void *, wait_result_t);
447
448 static void ifnet_poll_thread_func(void *, wait_result_t);
449 static void ifnet_poll_thread_cont(void *, wait_result_t);
450
451 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
452 classq_pkt_t *, boolean_t, boolean_t *);
453
454 static void ifp_src_route_copyout(struct ifnet *, struct route *);
455 static void ifp_src_route_copyin(struct ifnet *, struct route *);
456 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
457 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
458
459 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
460 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
461 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
462 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
463 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
464 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
465 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
466 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
467 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
468 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
469 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
470
471 struct chain_len_stats tx_chain_len_stats;
472 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
473
474 #if TEST_INPUT_THREAD_TERMINATION
475 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
476 #endif /* TEST_INPUT_THREAD_TERMINATION */
477
478
479 /* The following are protected by dlil_ifnet_lock */
480 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
481 static u_int32_t ifnet_detaching_cnt;
482 static boolean_t ifnet_detaching_embryonic;
483 static void *ifnet_delayed_run; /* wait channel for detaching thread */
484
485 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
486 &dlil_lck_attributes);
487
488 static uint32_t ifnet_flowhash_seed;
489
/*
 * Key material hashed by ifnet_calc_flowhash() to derive a per-ifnet
 * flow hash (stored in ifnet_fc_entry.ifce_flowhash).  ifk_rand1 and
 * ifk_rand2 presumably carry random salt alongside ifnet_flowhash_seed
 * -- confirm in ifnet_calc_flowhash().
 */
struct ifnet_flowhash_key {
	char ifk_name[IFNAMSIZ];	/* interface name */
	uint32_t ifk_unit;		/* interface unit number */
	uint32_t ifk_flags;		/* interface flags */
	uint32_t ifk_eflags;		/* extended flags */
	uint32_t ifk_capabilities;	/* capabilities supported */
	uint32_t ifk_capenable;		/* capabilities enabled */
	uint32_t ifk_output_sched_model; /* output scheduling model */
	uint32_t ifk_rand1;		/* random salt word 1 */
	uint32_t ifk_rand2;		/* random salt word 2 */
};
501
502 /* Flow control entry per interface */
/*
 * Node of ifnet_fc_tree (red-black tree, ordered by ifce_cmp and
 * protected by ifnet_fc_lock); maps a flow hash back to its interface.
 */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;	/* ifnet_fc_tree linkage */
	u_int32_t ifce_flowhash;		/* flow hash of the interface */
	struct ifnet *ifce_ifp;			/* interface this entry refers to */
};
508
509 static uint32_t ifnet_calc_flowhash(struct ifnet *);
510 static int ifce_cmp(const struct ifnet_fc_entry *,
511 const struct ifnet_fc_entry *);
512 static int ifnet_fc_add(struct ifnet *);
513 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
514 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
515
516 /* protected by ifnet_fc_lock */
517 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
518 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
519 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
520
521 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
522
523 extern void bpfdetach(struct ifnet *);
524 extern void proto_input_run(void);
525
526 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
527 u_int32_t flags);
528 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
529 u_int32_t flags);
530
531 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
532
533 #if CONFIG_MACF
534 #if !XNU_TARGET_OS_OSX
535 int dlil_lladdr_ckreq = 1;
536 #else /* XNU_TARGET_OS_OSX */
537 int dlil_lladdr_ckreq = 0;
538 #endif /* XNU_TARGET_OS_OSX */
539 #endif /* CONFIG_MACF */
540
541 #if DEBUG
542 int dlil_verbose = 1;
543 #else
544 int dlil_verbose = 0;
545 #endif /* DEBUG */
546 #if IFNET_INPUT_SANITY_CHK
547 /* sanity checking of input packet lists received */
548 static u_int32_t dlil_input_sanity_check = 0;
549 #endif /* IFNET_INPUT_SANITY_CHK */
550 /* rate limit debug messages */
551 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
552
553 SYSCTL_DECL(_net_link_generic_system);
554
555 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
556 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
557
558 #define IF_SNDQ_MINLEN 32
559 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
560 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
561 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
562 sysctl_sndq_maxlen, "I", "Default transmit queue max length");
563
564 #define IF_RCVQ_MINLEN 32
565 #define IF_RCVQ_MAXLEN 256
566 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
567 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
568 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
569 sysctl_rcvq_maxlen, "I", "Default receive queue max length");
570
571 #define IF_RXPOLL_DECAY 2 /* ilog2 of EWMA decay rate (4) */
572 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
573 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
574 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
575 "ilog2 of EWMA decay rate of avg inbound packets");
576
577 #define IF_RXPOLL_MODE_HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10 ms */
578 #define IF_RXPOLL_MODE_HOLDTIME (1000ULL * 1000 * 1000) /* 1 sec */
579 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
580 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
581 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
582 IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
583 "Q", "input poll mode freeze time");
584
585 #define IF_RXPOLL_SAMPLETIME_MIN (1ULL * 1000 * 1000) /* 1 ms */
586 #define IF_RXPOLL_SAMPLETIME (10ULL * 1000 * 1000) /* 10 ms */
587 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
588 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
589 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
590 IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
591 "Q", "input poll sampling time");
592
593 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
594 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
595 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
596 IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
597 "Q", "input poll interval (time)");
598
599 #define IF_RXPOLL_INTERVAL_PKTS 0 /* 0 (disabled) */
600 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
601 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
602 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
603 IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
604
605 #define IF_RXPOLL_WLOWAT 10
606 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
607 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
608 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
609 IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
610 "I", "input poll wakeup low watermark");
611
612 #define IF_RXPOLL_WHIWAT 100
613 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
614 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
615 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
616 IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
617 "I", "input poll wakeup high watermark");
618
619 static u_int32_t if_rxpoll_max = 0; /* 0 (automatic) */
620 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
621 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
622 "max packets per poll call");
623
624 u_int32_t if_rxpoll = 1;
625 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
626 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
627 sysctl_rxpoll, "I", "enable opportunistic input polling");
628
629 #if TEST_INPUT_THREAD_TERMINATION
630 static u_int32_t if_input_thread_termination_spin = 0;
631 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
632 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
633 &if_input_thread_termination_spin, 0,
634 sysctl_input_thread_termination_spin,
635 "I", "input thread termination spin limit");
636 #endif /* TEST_INPUT_THREAD_TERMINATION */
637
638 static u_int32_t cur_dlil_input_threads = 0;
639 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
640 CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
641 "Current number of DLIL input threads");
642
643 #if IFNET_INPUT_SANITY_CHK
644 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
645 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
646 "Turn on sanity checking in DLIL input");
647 #endif /* IFNET_INPUT_SANITY_CHK */
648
649 static u_int32_t if_flowadv = 1;
650 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
651 CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
652 "enable flow-advisory mechanism");
653
654 static u_int32_t if_delaybased_queue = 1;
655 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
656 CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
657 "enable delay based dynamic queue sizing");
658
659 static uint64_t hwcksum_in_invalidated = 0;
660 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
661 hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
662 &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
663
664 uint32_t hwcksum_dbg = 0;
665 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
666 CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
667 "enable hardware cksum debugging");
668
669 u_int32_t ifnet_start_delayed = 0;
670 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
671 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
672 "number of times start was delayed");
673
674 u_int32_t ifnet_delay_start_disabled = 0;
675 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
676 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
677 "number of times start was delayed");
678
679 #if DEVELOPMENT || DEBUG
680 static int packet_dump_trace_update SYSCTL_HANDLER_ARGS;
681
682 struct flow_key flow_key_trace;
683 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, flow_key_trace, CTLFLAG_WR | CTLFLAG_LOCKED |
684 CTLFLAG_KERN | CTLFLAG_ANYBODY, 0, 0, packet_dump_trace_update, "S", "Set flow key for packet tracing");
685 #endif /* DEVELOPMENT || DEBUG */
686
/*
 * Atomically bump the "start delay disabled" counter exported via the
 * net.link.generic.system.start_delay_disabled sysctl above.
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
692
693 #define HWCKSUM_DBG_PARTIAL_FORCED 0x1 /* forced partial checksum */
694 #define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ 0x2 /* adjust start offset */
695 #define HWCKSUM_DBG_FINALIZE_FORCED 0x10 /* forced finalize */
696 #define HWCKSUM_DBG_MASK \
697 (HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ | \
698 HWCKSUM_DBG_FINALIZE_FORCED)
699
700 static uint32_t hwcksum_dbg_mode = 0;
701 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
702 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
703 0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");
704
705 static uint64_t hwcksum_dbg_partial_forced = 0;
706 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
707 hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
708 &hwcksum_dbg_partial_forced, "packets forced using partial cksum");
709
710 static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
711 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
712 hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
713 &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");
714
715 static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
716 SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
717 hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
718 &hwcksum_dbg_partial_rxoff_forced, 0,
719 sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
720 "forced partial cksum rx offset");
721
722 static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
723 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
724 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
725 0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
726 "adjusted partial cksum rx offset");
727
/*
 * Hardware-checksum debug counters, exported read-only via sysctl.
 * Each counter tracks one stage of the hwcksum debug verification path.
 */
static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

/* Global enable switches for TX/RX hardware checksum offload (RW). */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

/* TX chain-length histogram; arg2 (9) is the number of buckets exported. */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

/* DLIL data-threshold notification controls. */
static uint32_t threshold_notify = 1; /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2; /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

/* Aggregate network API usage statistics, exported as a struct. */
struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

/* When non-zero, wake-packet events are logged for debugging. */
uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");
797
/* Debug helper: hex-dump a buffer to the kernel log. */
static void log_hexdump(void *data, size_t len);

/* Global input-path knobs (tunable at boot). */
unsigned int net_rxpoll = 1;    /* enable opportunistic input polling */
unsigned int net_affinity = 1;  /* bind ifnet threads to a thread affinity set */
unsigned int net_async = 1;     /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
810
/*
 * Track how many attached interface filters are incompatible with TSO,
 * and bump the route generation ID so TCP re-evaluates TSO eligibility.
 *
 * filter_enable TRUE increments the per-ifnet counter (a filter that
 * prevents TSO was attached); FALSE decrements it (filter removed).
 */
void
ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
{
	/*
	 * update filter count and route_generation ID to let TCP
	 * know it should reevaluate doing TSO or not
	 */
	if (filter_enable) {
		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
	} else {
		/* counter must not underflow: every disable pairs an enable */
		VERIFY(ifp->if_flt_no_tso_count != 0);
		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
	}
	routegenid_update();
}
826
827 #if SKYWALK
828
#if defined(XNU_TARGET_OS_OSX)
/* macOS-only: legacy-filter compatibility check for Skywalk interfaces. */
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* Derived booleans snapshotting individual IF_ATTACH_NX_* default bits. */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
845
846 #if (DEVELOPMENT || DEBUG)
847 static int
848 if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
849 {
850 #pragma unused(oidp, arg1, arg2)
851 unsigned int new_value;
852 int changed;
853 int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
854 &new_value, &changed);
855 if (error) {
856 return error;
857 }
858 if (changed) {
859 if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
860 (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
861 return ENOTSUP;
862 }
863 if_attach_nx = new_value;
864 }
865 return 0;
866 }
867
/* DEVELOPMENT/DEBUG only: expose if_attach_nx via sysctl. */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
873
/*
 * sysctl handler for net.link.generic.system.enable_netagent.
 * Toggles the flowswitch transport netagent at runtime; registering or
 * deregistering the netagents as a side effect.  Only valid when the
 * boot-time if_attach_nx configuration included the transport netagent.
 */
static int
if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error;

	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
	    sizeof(if_enable_fsw_transport_netagent),
	    &new_value, &changed);
	if (error == 0 && changed != 0) {
		if (new_value != 0 && new_value != 1) {
			/* only allow 0 or 1 */
			error = EINVAL;
		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
			/* netagent can be enabled/disabled */
			if_enable_fsw_transport_netagent = new_value;
			/* flip registration to match the new setting */
			if (new_value == 0) {
				kern_nexus_deregister_netagents();
			} else {
				kern_nexus_register_netagents();
			}
		} else {
			/* netagent can't be enabled */
			error = ENOTSUP;
		}
	}
	return error;
}
904
/* Runtime toggle for the flowswitch transport netagent. */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
913
914 boolean_t
ifnet_nx_noauto(ifnet_t ifp)915 ifnet_nx_noauto(ifnet_t ifp)
916 {
917 return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
918 }
919
920 boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)921 ifnet_nx_noauto_flowswitch(ifnet_t ifp)
922 {
923 return ifnet_is_low_latency(ifp);
924 }
925
926 boolean_t
ifnet_is_low_latency(ifnet_t ifp)927 ifnet_is_low_latency(ifnet_t ifp)
928 {
929 return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
930 }
931
/*
 * Decide whether the netif compatibility layer should be plumbed for
 * this interface.  Gated globally on IF_ATTACH_NX_NETIF_COMPAT; on
 * non-macOS platforms Wi-Fi AP interfaces ("ap") additionally honor the
 * if_netif_all override to conserve memory.
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
956
957 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)958 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
959 {
960 if (if_is_fsw_transport_netagent_enabled()) {
961 /* check if netagent has been manually enabled for ipsec/utun */
962 if (ifp->if_family == IFNET_FAMILY_IPSEC) {
963 return ipsec_interface_needs_netagent(ifp);
964 } else if (ifp->if_family == IFNET_FAMILY_UTUN) {
965 return utun_interface_needs_netagent(ifp);
966 }
967
968 /* check ifnet no auto nexus override */
969 if (ifnet_nx_noauto(ifp)) {
970 return FALSE;
971 }
972
973 /* check global if_attach_nx configuration */
974 switch (ifp->if_family) {
975 case IFNET_FAMILY_CELLULAR:
976 case IFNET_FAMILY_ETHERNET:
977 if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
978 return TRUE;
979 }
980 break;
981 default:
982 break;
983 }
984 }
985 return FALSE;
986 }
987
988 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)989 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
990 {
991 #pragma unused(ifp)
992 if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
993 return TRUE;
994 }
995 return FALSE;
996 }
997
998 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)999 ifnet_needs_netif_netagent(ifnet_t ifp)
1000 {
1001 #pragma unused(ifp)
1002 return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
1003 }
1004
/*
 * Detach and free a single nexus provider instance.  When a device port
 * is present it is detached first, then the instance is freed.  Errors
 * are logged but not propagated — teardown is best-effort.
 *
 * Returns TRUE when an instance was present (work was attempted),
 * FALSE when there was nothing to detach.
 */
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t err;

	if (instance == NULL || uuid_is_null(instance)) {
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}
1031
/*
 * Tear down a nexus: detach/free the instance (and device port) and
 * deregister the provider.  func_str names the caller for log messages.
 * Returns TRUE when either an instance or a provider was present,
 * FALSE when both were null (nothing to do).
 */
static boolean_t
dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
    uuid_t device)
{
	boolean_t detached = FALSE;
	nexus_controller_t controller = kern_nexus_shared_controller();
	int err;

	if (dlil_detach_nexus_instance(controller, func_str, instance,
	    device)) {
		detached = TRUE;
	}
	if (provider != NULL && !uuid_is_null(provider)) {
		/* deregistration failure is logged, not propagated */
		detached = TRUE;
		err = kern_nexus_controller_deregister_provider(controller,
		    provider);
		if (err != 0) {
			DLIL_PRINTF("%s deregister_provider %d\n",
			    func_str, err);
		}
	}
	return detached;
}
1055
/*
 * Register a nexus provider named "com.apple.<type>.<ifname>" under the
 * default domain provider for `type' (netif or flowswitch) and allocate
 * one instance of it.  On success *provider and *instance hold the new
 * UUIDs.  On any failure everything created here is rolled back and the
 * error is returned.  The caller retains ownership of `attr'.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t dom_prov;
	errno_t err;
	nexus_name_t provider_name;
	const char *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* roll back the provider registration; error not checked */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
	/* note: the success path falls through the label with err == 0 */
failed:
	return err;
}
1105
1106 static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp,if_nexus_netif_t netif_nx)1107 dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
1108 {
1109 nexus_attr_t attr = NULL;
1110 nexus_controller_t controller;
1111 errno_t err;
1112
1113 if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
1114 /* it's already attached */
1115 if (dlil_verbose) {
1116 DLIL_PRINTF("%s: %s already has nexus attached\n",
1117 __func__, if_name(ifp));
1118 /* already attached */
1119 }
1120 goto failed;
1121 }
1122
1123 err = kern_nexus_attr_create(&attr);
1124 if (err != 0) {
1125 DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
1126 if_name(ifp));
1127 goto failed;
1128 }
1129 err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
1130 VERIFY(err == 0);
1131
1132 controller = kern_nexus_shared_controller();
1133
1134 /* create the netif provider and instance */
1135 err = dlil_create_provider_and_instance(controller,
1136 NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
1137 &netif_nx->if_nif_instance, attr);
1138 if (err != 0) {
1139 goto failed;
1140 }
1141 err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
1142 ifp, NULL, FALSE, &netif_nx->if_nif_attach);
1143 if (err != 0) {
1144 DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
1145 __func__, err);
1146 /* cleanup provider and instance */
1147 dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
1148 netif_nx->if_nif_instance, NULL);
1149 goto failed;
1150 }
1151 return TRUE;
1152
1153 failed:
1154 if (attr != NULL) {
1155 kern_nexus_attr_destroy(attr);
1156 }
1157 return FALSE;
1158 }
1159
1160 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1161 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1162 {
1163 if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1164 IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
1165 goto failed;
1166 }
1167 switch (ifp->if_type) {
1168 case IFT_CELLULAR:
1169 case IFT_ETHER:
1170 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1171 /* don't auto-attach */
1172 goto failed;
1173 }
1174 break;
1175 default:
1176 /* don't auto-attach */
1177 goto failed;
1178 }
1179 return dlil_attach_netif_nexus_common(ifp, netif_nx);
1180
1181 failed:
1182 return FALSE;
1183 }
1184
1185 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1186 dlil_is_native_netif_nexus(ifnet_t ifp)
1187 {
1188 return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1189 }
1190
/* Tear down the netif nexus described by nexus_netif (best-effort). */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1198
/*
 * Issue SIOCGIFDEVMTU to the driver and, on success, copy the
 * current/maximum device MTU into *ifdm_p.  Returns the ioctl error.
 */
static inline int
dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
{
	struct ifreq ifr;
	int error;

	bzero(&ifr, sizeof(ifr));
	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
	if (error == 0) {
		*ifdm_p = ifr.ifr_devmtu;
	}
	return error;
}
1212
1213 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)1214 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
1215 bool *use_multi_buflet, uint32_t *large_buf_size)
1216 {
1217 struct kern_pbufpool_memory_info rx_pp_info;
1218 struct kern_pbufpool_memory_info tx_pp_info;
1219 uint32_t if_max_mtu = 0;
1220 uint32_t drv_buf_size;
1221 struct ifdevmtu ifdm;
1222 int err;
1223
1224 /*
1225 * To perform intra-stack RX aggregation flowswitch needs to use
1226 * multi-buflet packet.
1227 */
1228 *use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
1229
1230 *large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
1231 /*
1232 * IP over Thunderbolt interface can deliver the largest IP packet,
1233 * but the driver advertises the MAX MTU as only 9K.
1234 */
1235 if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1236 if_max_mtu = IP_MAXPACKET;
1237 goto skip_mtu_ioctl;
1238 }
1239
1240 /* determine max mtu */
1241 bzero(&ifdm, sizeof(ifdm));
1242 err = dlil_siocgifdevmtu(ifp, &ifdm);
1243 if (__improbable(err != 0)) {
1244 DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1245 __func__, if_name(ifp));
1246 /* use default flowswitch buffer size */
1247 if_max_mtu = NX_FSW_BUFSIZE;
1248 } else {
1249 DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1250 ifdm.ifdm_max, ifdm.ifdm_current);
1251 /* rdar://problem/44589731 */
1252 if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1253 }
1254
1255 skip_mtu_ioctl:
1256 if (if_max_mtu == 0) {
1257 DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1258 __func__, if_name(ifp));
1259 return EINVAL;
1260 }
1261 if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1262 DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1263 "max bufsize(%d)\n", __func__,
1264 if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1265 return EINVAL;
1266 }
1267
1268 /*
1269 * for skywalk native driver, consult the driver packet pool also.
1270 */
1271 if (dlil_is_native_netif_nexus(ifp)) {
1272 err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1273 &tx_pp_info);
1274 if (err != 0) {
1275 DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1276 __func__, if_name(ifp));
1277 return ENXIO;
1278 }
1279 drv_buf_size = tx_pp_info.kpm_bufsize *
1280 tx_pp_info.kpm_max_frags;
1281 if (if_max_mtu > drv_buf_size) {
1282 DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1283 "tx %d * %d) can't support max mtu(%d)\n", __func__,
1284 if_name(ifp), rx_pp_info.kpm_bufsize,
1285 rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1286 tx_pp_info.kpm_max_frags, if_max_mtu);
1287 return EINVAL;
1288 }
1289 } else {
1290 drv_buf_size = if_max_mtu;
1291 }
1292
1293 if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1294 _CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1295 *use_multi_buflet = true;
1296 /* default flowswitch buffer size */
1297 *buf_size = NX_FSW_BUFSIZE;
1298 *large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1299 } else {
1300 *buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1301 }
1302
1303 /*
1304 * if HW TSO is enabled on a Skywalk native interface then make
1305 * the flowswitch default buffer be able to handle max TSO segment.
1306 */
1307 uint32_t tso_v4_mtu = 0;
1308 uint32_t tso_v6_mtu = 0;
1309 #ifdef XNU_TARGET_OS_OSX
1310 if (dlil_is_native_netif_nexus(ifp)) {
1311 if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
1312 tso_v4_mtu = ifp->if_tso_v4_mtu;
1313 }
1314 if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
1315 tso_v6_mtu = ifp->if_tso_v6_mtu;
1316 }
1317 }
1318 #endif /* XNU_TARGET_OS_OSX */
1319 if ((tso_v4_mtu != 0) || (tso_v6_mtu != 0)) {
1320 *buf_size = max(*buf_size, max(tso_v4_mtu, tso_v6_mtu));
1321 ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1322 }
1323 if (*buf_size >= *large_buf_size) {
1324 *large_buf_size = 0;
1325 }
1326 return 0;
1327 }
1328
1329 static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp,if_nexus_flowswitch_t nexus_fsw)1330 _dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
1331 {
1332 nexus_attr_t attr = NULL;
1333 nexus_controller_t controller;
1334 errno_t err = 0;
1335 uuid_t netif;
1336 uint32_t buf_size = 0;
1337 uint32_t large_buf_size = 0;
1338 bool multi_buflet;
1339
1340 if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
1341 IFNET_IS_VMNET(ifp)) {
1342 goto failed;
1343 }
1344
1345 if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
1346 /* not possible to attach (netif native/compat not plumbed) */
1347 goto failed;
1348 }
1349
1350 if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
1351 /* don't auto-attach */
1352 goto failed;
1353 }
1354
1355 /* get the netif instance from the ifp */
1356 err = kern_nexus_get_netif_instance(ifp, netif);
1357 if (err != 0) {
1358 DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
1359 if_name(ifp));
1360 goto failed;
1361 }
1362
1363 err = kern_nexus_attr_create(&attr);
1364 if (err != 0) {
1365 DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
1366 if_name(ifp));
1367 goto failed;
1368 }
1369
1370 err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
1371 &multi_buflet, &large_buf_size);
1372 if (err != 0) {
1373 goto failed;
1374 }
1375 ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
1376 ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);
1377
1378 /* Configure flowswitch buffer size */
1379 err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
1380 VERIFY(err == 0);
1381 err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
1382 large_buf_size);
1383 VERIFY(err == 0);
1384
1385 /*
1386 * Configure flowswitch to use super-packet (multi-buflet).
1387 */
1388 err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
1389 multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
1390 VERIFY(err == 0);
1391
1392 /* create the flowswitch provider and instance */
1393 controller = kern_nexus_shared_controller();
1394 err = dlil_create_provider_and_instance(controller,
1395 NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
1396 &nexus_fsw->if_fsw_instance, attr);
1397 if (err != 0) {
1398 goto failed;
1399 }
1400
1401 /* attach the device port */
1402 err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
1403 NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
1404 if (err != 0) {
1405 DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
1406 __func__, err, if_name(ifp));
1407 /* cleanup provider and instance */
1408 dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
1409 nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
1410 goto failed;
1411 }
1412 return TRUE;
1413
1414 failed:
1415 if (err != 0) {
1416 DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
1417 __func__, if_name(ifp), err);
1418 } else {
1419 DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
1420 __func__, if_name(ifp));
1421 }
1422 if (attr != NULL) {
1423 kern_nexus_attr_destroy(attr);
1424 }
1425 return FALSE;
1426 }
1427
1428 static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)1429 dlil_attach_flowswitch_nexus(ifnet_t ifp)
1430 {
1431 boolean_t attached;
1432 if_nexus_flowswitch nexus_fsw;
1433
1434 #if (DEVELOPMENT || DEBUG)
1435 if (skywalk_netif_direct_allowed(if_name(ifp))) {
1436 DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
1437 return FALSE;
1438 }
1439 #endif /* (DEVELOPMENT || DEBUG) */
1440
1441 /*
1442 * flowswitch attachment is not supported for interface using the
1443 * legacy model (IFNET_INIT_LEGACY)
1444 */
1445 if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
1446 DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
1447 if_name(ifp));
1448 return FALSE;
1449 }
1450
1451 if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
1452 /* it's already attached */
1453 return FALSE;
1454 }
1455 bzero(&nexus_fsw, sizeof(nexus_fsw));
1456 attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
1457 if (attached) {
1458 ifnet_lock_exclusive(ifp);
1459 if (!IF_FULLY_ATTACHED(ifp)) {
1460 /* interface is going away */
1461 attached = FALSE;
1462 } else {
1463 ifp->if_nx_flowswitch = nexus_fsw;
1464 }
1465 ifnet_lock_done(ifp);
1466 if (!attached) {
1467 /* clean up flowswitch nexus */
1468 dlil_detach_flowswitch_nexus(&nexus_fsw);
1469 }
1470 }
1471 return attached;
1472 }
1473
/* Tear down the flowswitch nexus described by nexus_fsw (best-effort). */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1481
1482 __attribute__((noinline))
1483 static void
dlil_netif_detach_notify(ifnet_t ifp)1484 dlil_netif_detach_notify(ifnet_t ifp)
1485 {
1486 void (*detach_notify)(struct nexus_netif_adapter *);
1487
1488 /*
1489 * This is only needed for low latency interfaces for now.
1490 */
1491 if (!ifnet_is_low_latency(ifp)) {
1492 return;
1493 }
1494 detach_notify = (ifp->if_na_ops != NULL) ? ifp->if_na_ops->ni_detach_notify : NULL;
1495 if (detach_notify != NULL) {
1496 (*detach_notify)(ifp->if_na);
1497 } else {
1498 DLIL_PRINTF("%s: %s has no detach notify calback\n",
1499 __func__, if_name(ifp));
1500 }
1501 }
1502
/*
 * Quiesce data movement on ifp, then detach its flowswitch and netif
 * nexuses (flowswitch first, since it sits above the netif).  Each
 * nexus state is zeroed after teardown; the ASSERTs check the UUIDs
 * are consistently all-set or all-null.  Data movement is resumed
 * before returning.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block new data movement and wait for in-flight work to drain */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1534
1535 boolean_t
ifnet_add_netagent(ifnet_t ifp)1536 ifnet_add_netagent(ifnet_t ifp)
1537 {
1538 int error;
1539
1540 error = kern_nexus_interface_add_netagent(ifp);
1541 os_log(OS_LOG_DEFAULT,
1542 "kern_nexus_interface_add_netagent(%s) returned %d",
1543 ifp->if_xname, error);
1544 return error == 0;
1545 }
1546
1547 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1548 ifnet_remove_netagent(ifnet_t ifp)
1549 {
1550 int error;
1551
1552 error = kern_nexus_interface_remove_netagent(ifp);
1553 os_log(OS_LOG_DEFAULT,
1554 "kern_nexus_interface_remove_netagent(%s) returned %d",
1555 ifp->if_xname, error);
1556 return error == 0;
1557 }
1558
1559 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1560 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1561 {
1562 if (!IF_FULLY_ATTACHED(ifp)) {
1563 return FALSE;
1564 }
1565 return dlil_attach_flowswitch_nexus(ifp);
1566 }
1567
/*
 * Detach the flowswitch nexus from ifp.  The nexus state is snapshotted
 * and cleared from the ifnet under the exclusive lock, then torn down
 * outside the lock (teardown may block).  Returns TRUE when a nexus was
 * present.
 */
boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)
{
	if_nexus_flowswitch nexus_fsw;

	ifnet_lock_exclusive(ifp);
	nexus_fsw = ifp->if_nx_flowswitch;
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
	ifnet_lock_done(ifp);
	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
	           nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
}
1580
/*
 * Public wrapper: attach a netif nexus to ifp.  The nexus is created
 * outside the lock, then published into the ifnet under the exclusive
 * lock.  Refuses while the interface is not fully attached.  Returns
 * TRUE on success.
 */
boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)
{
	boolean_t nexus_attached;
	if_nexus_netif nexus_netif;

	if (!IF_FULLY_ATTACHED(ifp)) {
		return FALSE;
	}
	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
	if (nexus_attached) {
		ifnet_lock_exclusive(ifp);
		ifp->if_nx_netif = nexus_netif;
		ifnet_lock_done(ifp);
	}
	return nexus_attached;
}
1598
/*
 * Detach the netif nexus from ifp.  The nexus state is snapshotted and
 * cleared from the ifnet under the exclusive lock, then torn down
 * outside the lock (teardown may block).  Returns TRUE when a nexus was
 * present.
 */
boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)
{
	if_nexus_netif nexus_netif;

	ifnet_lock_exclusive(ifp);
	nexus_netif = ifp->if_nx_netif;
	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
	ifnet_lock_done(ifp);

	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
	           nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
}
1612
1613 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1614 ifnet_attach_native_flowswitch(ifnet_t ifp)
1615 {
1616 if (!dlil_is_native_netif_nexus(ifp)) {
1617 /* not a native netif */
1618 return;
1619 }
1620 ifnet_attach_flowswitch_nexus(ifp);
1621 }
1622
1623 #endif /* SKYWALK */
1624
/*
 * Sanity-check an input mbuf: it must carry a packet header and its
 * recorded receive interface must match ifp (loopback excepted).
 * Panics on violation.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	        /* NOTREACHED */                                        \
	}                                                               \
}

/*
 * Exponentially-weighted moving average: fold `new' into `old' with a
 * power-of-two decay factor (avg = ((avg << decay) - avg + new) >> decay).
 * Seeds directly with `new' when the average is still zero.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)

/* Link-speed units in bits per second. */
#define MBPS (1ULL * 1000 * 1000)
#define GBPS (MBPS * 1000)

/*
 * RX-poll watermark table, indexed by downlink speed; a zero speed
 * entry terminates the table.
 */
struct rxpoll_time_tbl {
	u_int64_t speed; /* downlink speed */
	u_int32_t plowat; /* packets low watermark */
	u_int32_t phiwat; /* packets high watermark */
	u_int32_t blowat; /* bytes low watermark */
	u_int32_t bhiwat; /* bytes high watermark */
};

static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
1662
/*
 * Count of DLIL worker threads still starting up; protected by
 * dlil_thread_sync_lock.  Waiters sleep on the counter's address and
 * are woken when it drops to zero.
 */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;

/* Note one more DLIL thread pending startup. */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}

/*
 * Note one DLIL thread finished starting; wake anyone waiting for all
 * pending threads once the count reaches zero.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1688
1689 int
proto_hash_value(u_int32_t protocol_family)1690 proto_hash_value(u_int32_t protocol_family)
1691 {
1692 /*
1693 * dlil_proto_unplumb_all() depends on the mapping between
1694 * the hash bucket index and the protocol family defined
1695 * here; future changes must be applied there as well.
1696 */
1697 switch (protocol_family) {
1698 case PF_INET:
1699 return 0;
1700 case PF_INET6:
1701 return 1;
1702 case PF_VLAN:
1703 return 2;
1704 case PF_UNSPEC:
1705 default:
1706 return 3;
1707 }
1708 }
1709
1710 /*
1711 * Caller must already be holding ifnet lock.
1712 */
1713 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1714 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1715 {
1716 struct if_proto *proto = NULL;
1717 u_int32_t i = proto_hash_value(protocol_family);
1718
1719 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1720
1721 if (ifp->if_proto_hash != NULL) {
1722 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1723 }
1724
1725 while (proto != NULL && proto->protocol_family != protocol_family) {
1726 proto = SLIST_NEXT(proto, next_hash);
1727 }
1728
1729 if (proto != NULL) {
1730 if_proto_ref(proto);
1731 }
1732
1733 return proto;
1734 }
1735
/* Take an additional reference on an attached-protocol entry. */
static void
if_proto_ref(struct if_proto *proto)
{
	atomic_add_32(&proto->refcount, 1);
}
1741
1742 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1743
/*
 * Drop a reference on an attached-protocol entry.  When the last
 * reference goes away: invoke the protocol's detached callback, purge
 * routes for the interface/protocol pair, post KEV_DL_PROTO_DETACHED,
 * mark the interface down if no protocols remain, and free the entry.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = atomic_add_32_ov(&proto->refcount, -1);
	if (oldval > 1) {
		/* other references remain; nothing more to do */
		return;
	}

	/* last reference dropped: run the KPI-version-specific callback */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1805
1806 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1807 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1808 {
1809 #if !MACH_ASSERT
1810 #pragma unused(ifp)
1811 #endif
1812 unsigned int type = 0;
1813 int ass = 1;
1814
1815 switch (what) {
1816 case IFNET_LCK_ASSERT_EXCLUSIVE:
1817 type = LCK_RW_ASSERT_EXCLUSIVE;
1818 break;
1819
1820 case IFNET_LCK_ASSERT_SHARED:
1821 type = LCK_RW_ASSERT_SHARED;
1822 break;
1823
1824 case IFNET_LCK_ASSERT_OWNED:
1825 type = LCK_RW_ASSERT_HELD;
1826 break;
1827
1828 case IFNET_LCK_ASSERT_NOTOWNED:
1829 /* nothing to do here for RW lock; bypass assert */
1830 ass = 0;
1831 break;
1832
1833 default:
1834 panic("bad ifnet assert type: %d", what);
1835 /* NOTREACHED */
1836 }
1837 if (ass) {
1838 LCK_RW_ASSERT(&ifp->if_lock, type);
1839 }
1840 }
1841
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	/* Acquire the per-interface RW lock for reading */
	lck_rw_lock_shared(&ifp->if_lock);
}
1847
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	/* Acquire the per-interface RW lock for writing */
	lck_rw_lock_exclusive(&ifp->if_lock);
}
1853
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	/* Release the per-interface RW lock (shared or exclusive) */
	lck_rw_done(&ifp->if_lock);
}
1859
1860 #if INET
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	/* Acquire the per-interface IPv4 data RW lock for reading */
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}
1866
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	/* Acquire the per-interface IPv4 data RW lock for writing */
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}
1872
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	/* Release the per-interface IPv4 data RW lock */
	lck_rw_done(&ifp->if_inetdata_lock);
}
1878 #endif
1879
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	/* Acquire the per-interface IPv6 data RW lock for reading */
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}
1885
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	/* Acquire the per-interface IPv6 data RW lock for writing */
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}
1891
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	/* Release the per-interface IPv6 data RW lock */
	lck_rw_done(&ifp->if_inet6data_lock);
}
1897
__private_extern__ void
ifnet_head_lock_shared(void)
{
	/* Acquire the global interface-list RW lock for reading */
	lck_rw_lock_shared(&ifnet_head_lock);
}
1903
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	/* Acquire the global interface-list RW lock for writing */
	lck_rw_lock_exclusive(&ifnet_head_lock);
}
1909
__private_extern__ void
ifnet_head_done(void)
{
	/* Release the global interface-list RW lock */
	lck_rw_done(&ifnet_head_lock);
}
1915
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	/* Caller must hold the interface-list lock exclusively */
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1921
1922 /*
1923 * dlil_ifp_protolist
1924 * - get the list of protocols attached to the interface, or just the number
1925 * of attached protocols
1926 * - if the number returned is greater than 'list_count', truncation occurred
1927 *
1928 * Note:
1929 * - caller must already be holding ifnet lock.
1930 */
1931 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1932 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1933 u_int32_t list_count)
1934 {
1935 u_int32_t count = 0;
1936 int i;
1937
1938 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1939
1940 if (ifp->if_proto_hash == NULL) {
1941 goto done;
1942 }
1943
1944 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1945 struct if_proto *proto;
1946 SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1947 if (list != NULL && count < list_count) {
1948 list[count] = proto->protocol_family;
1949 }
1950 count++;
1951 }
1952 }
1953 done:
1954 return count;
1955 }
1956
1957 __private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp,u_int32_t * protolist,u_int32_t count)1958 if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
1959 {
1960 ifnet_lock_shared(ifp);
1961 count = dlil_ifp_protolist(ifp, protolist, count);
1962 ifnet_lock_done(ifp);
1963 return count;
1964 }
1965
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	/* Free a protocol list allocated by the caller of if_get_protolist() */
	kfree_data_addr(list);
}
1971
1972 __private_extern__ int
dlil_post_msg(struct ifnet * ifp,u_int32_t event_subclass,u_int32_t event_code,struct net_event_data * event_data,u_int32_t event_data_len,boolean_t suppress_generation)1973 dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
1974 u_int32_t event_code, struct net_event_data *event_data,
1975 u_int32_t event_data_len, boolean_t suppress_generation)
1976 {
1977 struct net_event_data ev_data;
1978 struct kev_msg ev_msg;
1979
1980 bzero(&ev_msg, sizeof(ev_msg));
1981 bzero(&ev_data, sizeof(ev_data));
1982 /*
1983 * a net event always starts with a net_event_data structure
1984 * but the caller can generate a simple net event or
1985 * provide a longer event structure to post
1986 */
1987 ev_msg.vendor_code = KEV_VENDOR_APPLE;
1988 ev_msg.kev_class = KEV_NETWORK_CLASS;
1989 ev_msg.kev_subclass = event_subclass;
1990 ev_msg.event_code = event_code;
1991
1992 if (event_data == NULL) {
1993 event_data = &ev_data;
1994 event_data_len = sizeof(struct net_event_data);
1995 }
1996
1997 strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
1998 event_data->if_family = ifp->if_family;
1999 event_data->if_unit = (u_int32_t)ifp->if_unit;
2000
2001 ev_msg.dv[0].data_length = event_data_len;
2002 ev_msg.dv[0].data_ptr = event_data;
2003 ev_msg.dv[1].data_length = 0;
2004
2005 bool update_generation = true;
2006 if (event_subclass == KEV_DL_SUBCLASS) {
2007 /* Don't update interface generation for frequent link quality and state changes */
2008 switch (event_code) {
2009 case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
2010 case KEV_DL_RRC_STATE_CHANGED:
2011 case KEV_DL_PRIMARY_ELECTED:
2012 update_generation = false;
2013 break;
2014 default:
2015 break;
2016 }
2017 }
2018
2019 /*
2020 * Some events that update generation counts might
2021 * want to suppress generation count.
2022 * One example is node presence/absence where we still
2023 * issue kernel event for the invocation but want to avoid
2024 * expensive operation of updating generation which triggers
2025 * NECP client updates.
2026 */
2027 if (suppress_generation) {
2028 update_generation = false;
2029 }
2030
2031 return dlil_event_internal(ifp, &ev_msg, update_generation);
2032 }
2033
/*
 * Allocate the per-interface protocol statistics structures: the
 * TCP/UDP stats (from dedicated zones) and the IPv4/IPv6 ECN stats.
 *
 * The tcp/udp structures are carved out of oversized zone buffers: the
 * usable object starts at the first 64-bit-aligned address that leaves
 * room for one pointer below it, and the original zone allocation
 * address is stashed in that pointer slot so it can be recovered at
 * free time.
 *
 * Returns 0 on success, EINVAL otherwise; on failure any allocations
 * made here are released.
 *
 * NOTE(review): if this is called while if_tcp_stat/if_udp_stat are
 * already set, 'ret' remains EINVAL and the cleanup path below frees
 * the existing stats.  This appears safe only because callers invoke it
 * on a freshly allocated ifnet — confirm before reusing elsewhere.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* On failure, undo whatever was allocated above */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* Recover the original zone address stashed below the base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2119
/*
 * Reset all opportunistic-polling state on the interface: clear the poll
 * cycle, flags, request counters, statistics, and the mode/sample hold
 * and timestamp timers, returning the interface to polling-off mode.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2138
/*
 * Set up a DLIL input thread for 'inp' and pick the input strategy.
 *
 * Four cases are distinguished:
 *   - ifp == NULL: the main input thread, created once at dlil_init time;
 *   - legacy interface with RXPOLL capability: hybrid polling thread;
 *   - net_async set or legacy interface: asynchronous per-ifnet thread;
 *   - otherwise: synchronous strategy with no dedicated thread (ENODEV
 *     is returned to indicate no thread was started).
 *
 * On success the started thread's continuation is reported through
 * '*thfunc' (when non-NULL) and 0 is returned; failure to create the
 * thread itself is fatal (panic).
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* Hybrid polling applies only to legacy (non-netif) RXPOLL interfaces */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp;            /* NULL for main input thread */
	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/* unlimited receive queue for non-polling interfaces */
		limit = (u_int32_t)-1;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		/* the main thread also owns the loopback receive queue */
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no dedicated thread to start */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2273
2274 #if TEST_INPUT_THREAD_TERMINATION
/*
 * sysctl handler for the input-thread termination spin count used by
 * the TEST_INPUT_THREAD_TERMINATION test facility.  Only accepts a new
 * value when rx polling is enabled.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		/* read-only access or error: nothing more to do */
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
2296 #endif /* TEST_INPUT_THREAD_TERMINATION */
2297
/*
 * Tear down a dlil_threading_info: destroy its lock and lock group and
 * reset all fields so the structure can be reused.  The VERIFYs check
 * that the thread, its driver/poller companions, and the affinity state
 * have already been released, and that no packets remain queued.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2323
/*
 * Terminate the calling input thread: drain and free its pending
 * packets, signal termination completion to the waiter, drop the thread
 * reference taken at creation, and finally terminate the current thread
 * (this function does not return).  Must be called on the thread being
 * terminated, and never on the main input thread.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t        v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	/* Detach the pending packet chain and signal the terminator */
	lck_mtx_lock_spin(&inp->dlth_lock);
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2371
2372 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2373 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2374 {
2375 thread_affinity_policy_data_t policy;
2376
2377 bzero(&policy, sizeof(policy));
2378 policy.affinity_tag = tag;
2379 return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2380 (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2381 }
2382
2383 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
2384 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2385 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2386 enum net_filter_event_subsystems state)
2387 {
2388 bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
2389 if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
2390 if_enable_fsw_transport_netagent = 1;
2391 } else {
2392 if_enable_fsw_transport_netagent = 0;
2393 }
2394 if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
2395 kern_nexus_update_netagents();
2396 } else if (!if_enable_fsw_transport_netagent) {
2397 necp_update_all_clients();
2398 }
2399 }
2400 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
2401
/*
 * One-time initialization of the data link interface layer (DLIL):
 * verifies compile-time invariants, parses boot-args, creates the zones
 * backing ifnet and per-interface stats allocations, initializes the
 * subsystems layered on DLIL, and finally starts the main input thread
 * and the interface detacher thread, waiting until both have been
 * scheduled at least once before returning.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	/* Interface logging flags/categories must match their IFNET twins */
	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	/* Interface family/subfamily constants must match their IFNET twins */
	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* Boot-arg overrides for DLIL tunables */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled.  This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
#ifdef XNU_TARGET_OS_OSX
	if (if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}
#endif /* XNU_TARGET_OS_OSX */

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	/* Size the ifnet zone; the debug variant carries extra state */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed.  The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2716
2717 static void
if_flt_monitor_busy(struct ifnet * ifp)2718 if_flt_monitor_busy(struct ifnet *ifp)
2719 {
2720 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2721
2722 ++ifp->if_flt_busy;
2723 VERIFY(ifp->if_flt_busy != 0);
2724 }
2725
/*
 * Drop one busy reference on the filter monitor; thin alias for
 * if_flt_monitor_leave().  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2731
/*
 * Wait until no other thread is monitoring the filter list of ifp, then
 * mark it busy.  Called and returns with if_flt_lock held; the lock is
 * dropped internally while sleeping in msleep() and reacquired on wakeup.
 */
static void
if_flt_monitor_enter(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	/* re-check after every wakeup: another waiter may have won the race */
	while (ifp->if_flt_busy) {
		++ifp->if_flt_waiters;
		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
		    (PZERO - 1), "if_flt_monitor", NULL);
	}
	if_flt_monitor_busy(ifp);
}
2744
/*
 * Release one busy reference on the filter monitor of ifp; when the last
 * reference is dropped, wake up all threads sleeping in
 * if_flt_monitor_enter().  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_flt_busy != 0);
	--ifp->if_flt_busy;

	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		/* waiters re-register themselves if they lose the race */
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
2758
/*
 * Attach an interface filter to ifp and return its reference through
 * filter_ref.
 *
 * Returns 0 on success; ENXIO if the interface is not in the global
 * interface list or is no longer attached.  On success the filter is
 * linked onto ifp->if_flt_head and the global/per-interface filter
 * statistics are updated; on failure the zone-allocated filter (if any)
 * is freed before returning.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* On success this takes an extra IO refcnt; released further below */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		/* per-interface count of non-OS (third-party) filters */
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* Release the IO refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2849
/*
 * Detach an interface filter and destroy it.
 *
 * detached == 0: normal detach path.  Search every interface's filter
 * list for the given reference; when found, mark it skipped (so the
 * input/output paths stop invoking it), unlink it, adjust the counters
 * and destroy it.  Returns EINVAL if the reference is not found (or
 * was already marked skipped by a concurrent detach).
 *
 * detached != 0: implicit detach from ifnet_detach_final(); the caller
 * has already emptied if_flt_head, so only the counters are adjusted
 * before the filter is destroyed.
 */
static int
dlil_detach_filter_internal(interface_filter_t filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1; /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				/* reacquire and serialize with the monitor */
				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	/* clear so the error path below cannot touch freed memory */
	filter = NULL;
done:
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2970
2971 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2972 dlil_detach_filter(interface_filter_t filter)
2973 {
2974 if (filter == NULL) {
2975 return;
2976 }
2977 dlil_detach_filter_internal(filter, 0);
2978 }
2979
2980 __private_extern__ boolean_t
dlil_has_ip_filter(void)2981 dlil_has_ip_filter(void)
2982 {
2983 boolean_t has_filter = (net_api_stats.nas_ipf_add_count > 0);
2984 DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2985 return has_filter;
2986 }
2987
2988 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2989 dlil_has_if_filter(struct ifnet *ifp)
2990 {
2991 boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2992 DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2993 return has_filter;
2994 }
2995
/*
 * Post work to an input thread: set DLIL_INPUT_WAITING and, if the
 * thread is not currently inside its work loop, wake it up.  The
 * per-thread wakeup counter (dlth_wtot) feeds the rxpoll wakeup-rate
 * EWMA statistics.  Caller must hold dlth_lock.
 */
static inline void
dlil_input_wakeup(struct dlil_threading_info *inp)
{
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);

	inp->dlth_flags |= DLIL_INPUT_WAITING;
	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
		inp->dlth_wtot++;
		wakeup_one((caddr_t)&inp->dlth_flags);
	}
}
3007
/*
 * Bootstrap entry point for the main DLIL input thread.
 *
 * Marks the thread embryonic and queues an initial self-wakeup so the
 * continuation runs at least once; on that first pass the continuation
 * clears the embryonic state and decrements the pending-thread count
 * the boot path waits on.  Never returns: it blocks into
 * dlil_main_input_thread_cont.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL); /* not bound to a single interface */
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3030
/*
 * Main input thread:
 *
 * a) handles all inbound packets for lo0
 * b) handles all inbound packets for interfaces with no dedicated
 *    input thread (e.g. anything but Ethernet/PDP or those that support
 *    opportunistic polling.)
 * c) protocol registrations
 * d) packet injections
 *
 * Runs as a thread_block_parameter() continuation: each pass drains the
 * work recorded in dlth_flags and the packet queues, then either loops
 * (more work arrived while it was running) or re-arms the wait and
 * blocks again.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* inpm aliases inp: the main thread's info embeds lo0's receive queue */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation clears the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		lck_mtx_unlock(&inp->dlth_lock);

		/* first pass only: unblock the boot-time wait in dlil_init */
		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* loop again only if new work was posted while running */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3127
/*
 * Input thread for interfaces with legacy input model.
 *
 * Bootstrap entry point: names the thread after the interface, marks it
 * embryonic, queues an initial self-wakeup, and blocks into
 * dlil_input_thread_cont.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/*
	 * An RXPOLL-capable legacy interface is served by the rxpoll input
	 * thread instead, unless polling is globally disabled (net_rxpoll).
	 */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3165
/*
 * Continuation body for a per-interface legacy input thread.  Drains
 * the interface's packet queue into
 * dlil_input_packet_list_extended(); terminates via
 * dlil_terminate_input_thread() when DLIL_INPUT_TERMINATE is set or
 * the wait was interrupted.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after thread creation clears the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		/* first pass only: unblock the attach-time wait */
		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if new work (beyond terminate) arrived */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3269
/*
 * Input thread for interfaces with opportunistic polling input model.
 *
 * Bootstrap entry point: names the thread after the interface, marks it
 * embryonic, queues an initial self-wakeup, and blocks into
 * dlil_rxpoll_input_thread_cont.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	/* only RXPOLL-capable legacy interfaces get this thread */
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3304
/*
 * Continuation body for the opportunistic-polling input thread.
 *
 * Each pass drains the interface's packet queue and, while a sampling
 * hold time is in effect, accumulates packet/byte/wakeup statistics.
 * Once per hold interval it updates the EWMA averages and decides
 * whether to switch between IFNET_MODEL_INPUT_POLL_ON and _OFF based
 * on the low/high watermarks; a mode change is pushed down to the
 * driver via its if_input_ctl callback.  Terminates via
 * dlil_terminate_input_thread() when DLIL_INPUT_TERMINATE is set or
 * the wait was interrupted.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass: clear embryonic state, skip the sampling logic */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the poll interval to the supported minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* hold time not yet elapsed; keep accumulating */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/*
			 * Below both low watermarks: turn polling off.
			 * Above the packet high watermark and either the
			 * byte or wakeup high watermark: turn polling on.
			 */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		/* first pass only: unblock the attach-time wait */
		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode. Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* tell the driver to switch its input model */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if new work (beyond terminate) arrived */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3590
3591 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3592 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3593 {
3594 if (p != NULL) {
3595 if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3596 (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3597 return EINVAL;
3598 }
3599 if (p->packets_lowat != 0 && /* hiwat must be non-zero */
3600 p->packets_lowat >= p->packets_hiwat) {
3601 return EINVAL;
3602 }
3603 if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3604 (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3605 return EINVAL;
3606 }
3607 if (p->bytes_lowat != 0 && /* hiwat must be non-zero */
3608 p->bytes_lowat >= p->bytes_hiwat) {
3609 return EINVAL;
3610 }
3611 if (p->interval_time != 0 &&
3612 p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3613 p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3614 }
3615 }
3616 return 0;
3617 }
3618
/*
 * Compute and install the RX polling parameters for ifp.
 *
 * When the input link rate is unknown (0) and no explicit parameters
 * are supplied, polling is effectively disabled: watermarks are opened
 * wide, the packet limit is cleared and the interval is set to the
 * minimum.  Otherwise parameters come from the rxpoll tuning table
 * indexed by link speed, with caller-supplied values and global sysctl
 * overrides taking precedence as noted below.
 *
 * Caller is expected to hold the input thread lock (see
 * dlil_rxpoll_set_params).
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* pick the highest table entry not exceeding the link rate */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/*
		 * Global overrides: a non-zero if_rxpoll_max wins over the
		 * caller's packet limit, and a non-default global interval
		 * wins over the caller's interval.
		 */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ? if_rxpoll_max : p->packets_limit);
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3688
/*
 * Must be called on an attached ifnet (caller is expected to check.)
 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
 *
 * Validates the parameters, then installs them via
 * dlil_rxpoll_update_params() under the input thread lock (taken here
 * unless the caller already holds it, as indicated by `locked').
 * Returns ENXIO if the interface does not support RX polling or has no
 * input thread, or the validation error from
 * dlil_rxpoll_validate_params().
 */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate.  If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values.  Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3730
3731 /*
3732 * Must be called on an attached ifnet (caller is expected to check.)
3733 */
3734 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3735 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3736 {
3737 struct dlil_threading_info *inp;
3738
3739 VERIFY(ifp != NULL && p != NULL);
3740 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3741 return ENXIO;
3742 }
3743
3744 bzero(p, sizeof(*p));
3745
3746 lck_mtx_lock(&inp->dlth_lock);
3747 p->packets_limit = ifp->if_rxpoll_plim;
3748 p->packets_lowat = ifp->if_rxpoll_plowat;
3749 p->packets_hiwat = ifp->if_rxpoll_phiwat;
3750 p->bytes_lowat = ifp->if_rxpoll_blowat;
3751 p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3752 p->interval_time = ifp->if_rxpoll_ival;
3753 lck_mtx_unlock(&inp->dlth_lock);
3754
3755 return 0;
3756 }
3757
3758 errno_t
ifnet_input(struct ifnet * ifp,struct mbuf * m_head,const struct ifnet_stat_increment_param * s)3759 ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
3760 const struct ifnet_stat_increment_param *s)
3761 {
3762 return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
3763 }
3764
3765 errno_t
ifnet_input_extended(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3766 ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
3767 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3768 {
3769 return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
3770 }
3771
3772 errno_t
ifnet_input_poll(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3773 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3774 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3775 {
3776 return ifnet_input_common(ifp, m_head, m_tail, s,
3777 (m_head != NULL), TRUE);
3778 }
3779
/*
 * Common entry point for the mbuf input variants (simple, extended,
 * poll).  Validates the chain against the supplied stats, computes
 * packet/byte counts when the caller did not, and dispatches the
 * chain through the interface's DLIL input function while holding a
 * datamov (IO) reference so the interface cannot detach underneath us.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/*
	 * A NULL chain is only legal for the poll variant; stats are
	 * mandatory for the extended variant.
	 */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* No tail supplied: walk the chain to find it and tally. */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* Debug: re-walk the chain to verify caller counts. */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	/*
	 * NOTE(review): when the caller supplied stats (s != NULL), the
	 * two assignments below update only the local copy _s, which is
	 * never read again — the input function still receives the
	 * caller's s.  Only the s == NULL path (where s aliases _s)
	 * observes these counts.  Presumably intentional, to preserve
	 * driver-provided byte counts — confirm before changing.
	 */
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3894
3895 #if SKYWALK
/*
 * Atomically install fn as the interface's DLIL input handler, but only
 * if the current handler is still the default dlil_input_handler.
 * Returns EBUSY when another handler is already installed.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return atomic_test_set_ptr(&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3903
/*
 * Restore the default DLIL input handler.  The compare-and-swap loop
 * re-reads the current handler each iteration until the swap back to
 * dlil_input_handler succeeds.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler))) {
		;
	}
}
3913
/*
 * Atomically install fn as the interface's DLIL output handler, but
 * only if the current handler is still the default dlil_output_handler.
 * Returns EBUSY when another handler is already installed.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return atomic_test_set_ptr(&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
3921
/*
 * Restore the default DLIL output handler; loops the compare-and-swap
 * until the swap back to dlil_output_handler succeeds.
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler))) {
		;
	}
}
3931 #endif /* SKYWALK */
3932
3933 errno_t
dlil_output_handler(struct ifnet * ifp,struct mbuf * m)3934 dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
3935 {
3936 return ifp->if_output(ifp, m);
3937 }
3938
/*
 * Default DLIL input handler: select the interface's input thread
 * (falling back to the main input thread when none is attached) and
 * dispatch the packet chain through its strategy routine.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	/* Threads marked for synchronous RX process the chain inline. */
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
3959
/*
 * Asynchronous input strategy: enqueue the packet chain on the input
 * thread's receive queue, fold the caller's stats in, and wake the
 * input thread to process it.  *s carries the chain's packet and byte
 * counts; tp is the calling (driver or poller) thread, used once to
 * set up thread affinity.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set. We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* Drop the spin lock across the affinity call, then retake. */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack. Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			_addq_multi(&inpm->lo_rcvq_pkts, &head, &tail,
			    m_cnt, m_size);
		} else {
			_addq_multi(&inp->dlth_pkts, &head, &tail,
			    m_cnt, m_size);
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/* Debug-only: re-walk the chain and verify the caller's counts. */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context. All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/* Notify only after the lock is dropped. */
	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4072
/*
 * Synchronous input strategy (DEVELOPMENT/DEBUG only, selected via
 * NET_THREAD_SYNC_RX): enqueue the chain, then immediately drain the
 * input queue and process the packets in the caller's context instead
 * of waking the input thread.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

#if IFNET_INPUT_SANITY_CHK
	/* Debug-only: re-walk the chain and verify the caller's counts. */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);

	/* Drain everything queued (possibly more than we just added). */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4157
4158 #if SKYWALK
/*
 * Atomically swap the driver's saved output routine for fn; fails with
 * EBUSY if if_output no longer matches the saved routine (i.e. someone
 * else already installed a handler).
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return atomic_test_set_ptr(&ifp->if_output,
	           ptrauth_nop_cast(void *, ifp->if_save_output),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4166
/*
 * Restore the driver's saved output routine; loops the compare-and-swap
 * (re-reading the current if_output each pass) until it succeeds.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output))) {
		;
	}
}
4176
/*
 * Atomically swap the driver's saved start routine for fn; fails with
 * EBUSY if if_start no longer matches the saved routine.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return atomic_test_set_ptr(&ifp->if_start,
	           ptrauth_nop_cast(void *, ifp->if_save_start),
	           ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
}
4184
/*
 * Restore the driver's saved start routine; loops the compare-and-swap
 * until it succeeds.
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!atomic_test_set_ptr(&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start))) {
		;
	}
}
4194 #endif /* SKYWALK */
4195
/*
 * Request TX service from the starter thread.  When resetfc is TRUE
 * (ifnet_start_common called on behalf of flow-control resume), the
 * flow-controlled flag is cleared first; otherwise a flow-controlled
 * interface is left alone.
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	ifp->if_start_req++;
	/*
	 * Wake the thread unless delayed-start batching (ENQUEUE_MULTI)
	 * is in effect and the queue hasn't reached the delay threshold.
	 */
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4225
4226 void
ifnet_start(struct ifnet * ifp)4227 ifnet_start(struct ifnet *ifp)
4228 {
4229 ifnet_start_common(ifp, FALSE);
4230 }
4231
/*
 * One-time entry point of the per-interface starter thread: names the
 * thread, optionally binds lo0's starter to the main input thread's
 * affinity set, then parks in the embryonic state and continues in
 * ifnet_start_thread_cont.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag. This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	/* Park in embryonic state; the continuation finishes the setup. */
	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4297
/*
 * Continuation body of the starter thread: services if_start requests
 * in a loop, then blocks again (with an optional TBR/delayed-start
 * deadline) or terminates when the interface is going away.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	if (__improbable(ifp->if_start_embryonic)) {
		/* First wakeup: leave embryonic state and report ready. */
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/* Delay the start when batching and below the threshold. */
		if (!IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
		    &ifp->if_start_cycle : NULL);

		/* Delayed start also needs a timed wakeup. */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4445
4446 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4447 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4448 {
4449 if (ts == NULL) {
4450 bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4451 } else {
4452 *(&ifp->if_start_cycle) = *ts;
4453 }
4454
4455 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4456 DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4457 if_name(ifp), ts->tv_nsec);
4458 }
4459 }
4460
4461 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4462 ifnet_poll_wakeup(struct ifnet *ifp)
4463 {
4464 LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4465
4466 ifp->if_poll_req++;
4467 if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4468 ifp->if_poll_thread != THREAD_NULL) {
4469 wakeup_one((caddr_t)&ifp->if_poll_thread);
4470 }
4471 }
4472
/*
 * Public kick for the RX poller thread: wakes it under if_poll_lock
 * if it is not already running.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4483
/*
 * One-time entry point of the per-interface RX poller thread: names
 * the thread, parks in the embryonic state, and continues in
 * ifnet_poll_thread_cont.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	/* Park in embryonic state; the continuation finishes the setup. */
	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4512
/*
 * Continuation body of the RX poller thread: repeatedly invokes the
 * driver's input poll routine and feeds the harvested chain into the
 * input path, then blocks again (optionally on the poll-cycle deadline)
 * or terminates when the interface is going away.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		/* First wakeup: leave embryonic state and report ready. */
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		u_int16_t req = ifp->if_poll_req;

		/* Per-poll packet budget; fall back to queue-derived cap. */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		    &m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* Empty poll still goes through the input path. */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4679
4680 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4681 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4682 {
4683 if (ts == NULL) {
4684 bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4685 } else {
4686 *(&ifp->if_poll_cycle) = *ts;
4687 }
4688
4689 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4690 DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4691 if_name(ifp), ts->tv_nsec);
4692 }
4693 }
4694
4695 void
ifnet_purge(struct ifnet * ifp)4696 ifnet_purge(struct ifnet *ifp)
4697 {
4698 if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4699 if_qflush_snd(ifp, false);
4700 }
4701 }
4702
4703 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)4704 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
4705 {
4706 IFCQ_LOCK_ASSERT_HELD(ifq);
4707
4708 if (!(IFCQ_IS_READY(ifq))) {
4709 return;
4710 }
4711
4712 if (IFCQ_TBR_IS_ENABLED(ifq)) {
4713 struct tb_profile tb = {
4714 .rate = ifq->ifcq_tbr.tbr_rate_raw,
4715 .percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
4716 };
4717 (void) ifclassq_tbr_set(ifq, &tb, FALSE);
4718 }
4719
4720 ifclassq_update(ifq, ev);
4721 }
4722
4723 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4724 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4725 {
4726 switch (ev) {
4727 case CLASSQ_EV_LINK_BANDWIDTH:
4728 if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4729 ifp->if_poll_update++;
4730 }
4731 break;
4732
4733 default:
4734 break;
4735 }
4736 }
4737
4738 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4739 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4740 {
4741 struct ifclassq *ifq;
4742 u_int32_t omodel;
4743 errno_t err;
4744
4745 if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4746 return EINVAL;
4747 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4748 return ENXIO;
4749 }
4750
4751 ifq = ifp->if_snd;
4752 IFCQ_LOCK(ifq);
4753 omodel = ifp->if_output_sched_model;
4754 ifp->if_output_sched_model = model;
4755 if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4756 ifp->if_output_sched_model = omodel;
4757 }
4758 IFCQ_UNLOCK(ifq);
4759
4760 return err;
4761 }
4762
4763 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4764 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4765 {
4766 if (ifp == NULL) {
4767 return EINVAL;
4768 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4769 return ENXIO;
4770 }
4771
4772 ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4773
4774 return 0;
4775 }
4776
4777 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4778 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4779 {
4780 if (ifp == NULL || maxqlen == NULL) {
4781 return EINVAL;
4782 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4783 return ENXIO;
4784 }
4785
4786 *maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4787
4788 return 0;
4789 }
4790
4791 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4792 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4793 {
4794 errno_t err;
4795
4796 if (ifp == NULL || pkts == NULL) {
4797 err = EINVAL;
4798 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4799 err = ENXIO;
4800 } else {
4801 err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4802 IF_CLASSQ_ALL_GRPS, pkts, NULL);
4803 }
4804
4805 return err;
4806 }
4807
4808 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4809 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4810 u_int32_t *pkts, u_int32_t *bytes)
4811 {
4812 errno_t err;
4813
4814 if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4815 (pkts == NULL && bytes == NULL)) {
4816 err = EINVAL;
4817 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4818 err = ENXIO;
4819 } else {
4820 err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4821 pkts, bytes);
4822 }
4823
4824 return err;
4825 }
4826
4827 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4828 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4829 {
4830 struct dlil_threading_info *inp;
4831
4832 if (ifp == NULL) {
4833 return EINVAL;
4834 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4835 return ENXIO;
4836 }
4837
4838 if (maxqlen == 0) {
4839 maxqlen = if_rcvq_maxlen;
4840 } else if (maxqlen < IF_RCVQ_MINLEN) {
4841 maxqlen = IF_RCVQ_MINLEN;
4842 }
4843
4844 inp = ifp->if_inp;
4845 lck_mtx_lock(&inp->dlth_lock);
4846 qlimit(&inp->dlth_pkts) = maxqlen;
4847 lck_mtx_unlock(&inp->dlth_lock);
4848
4849 return 0;
4850 }
4851
4852 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4853 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4854 {
4855 struct dlil_threading_info *inp;
4856
4857 if (ifp == NULL || maxqlen == NULL) {
4858 return EINVAL;
4859 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4860 return ENXIO;
4861 }
4862
4863 inp = ifp->if_inp;
4864 lck_mtx_lock(&inp->dlth_lock);
4865 *maxqlen = qlimit(&inp->dlth_pkts);
4866 lck_mtx_unlock(&inp->dlth_lock);
4867 return 0;
4868 }
4869
4870 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4871 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4872 uint16_t delay_timeout)
4873 {
4874 if (delay_qlen > 0 && delay_timeout > 0) {
4875 if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4876 ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4877 ifp->if_start_delay_timeout = min(20000, delay_timeout);
4878 /* convert timeout to nanoseconds */
4879 ifp->if_start_delay_timeout *= 1000;
4880 kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4881 ifp->if_xname, (uint32_t)delay_qlen,
4882 (uint32_t)delay_timeout);
4883 } else {
4884 if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4885 }
4886 }
4887
4888 /*
4889 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
4890 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
4891 * buf holds the full header.
4892 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned scratch copy, used when buf is not suitably aligned */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			/* work on an aligned copy of the IPv4 header */
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* fast path: no DSCP bits set, nothing to clear */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * Incrementally update the IPv4 header checksum for the
		 * TOS change (add old, subtract new) and fold the carry
		 * back into the low 16 bits.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		if (__improbable(p == lbuf)) {
			/* write the modified header back to the packet */
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			/* work on an aligned copy of the IPv6 header */
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* fast path: no DSCP bits set in the traffic class */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* IPv6 has no header checksum; just rewrite the field */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		if (__improbable(p == lbuf)) {
			/* write the modified header back to the packet */
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
4948
/*
 * Enqueue a single packet (mbuf or native Skywalk packet) onto the
 * interface's output classq (ifcq if given, else ifp->if_snd).
 * Stamps an enqueue timestamp if none is present, updates the
 * foreground/realtime activity timestamps, applies the Wi-Fi
 * multicast DSCP-clearing workaround, runs the delayed-start
 * heuristics for IFEF_ENQUEUE_MULTI drivers, and finally kicks the
 * driver's start routine when appropriate.  The caller relinquishes
 * ownership of the packet object; *pdrop reports whether it was
 * dropped by the classq.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here. Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* make sure at least the Ethernet header is contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP multicast: no DSCP to clear */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* mbuf data may have moved; refresh eh */
				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface. If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* too short for even an Ethernet header: skip */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* non-IP multicast: no DSCP to clear */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* ip_ver is always set on the paths that set mcast_buf */
	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still within the current delay window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* link went idle >= 200ms: reset, disable delay */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: decide, then open a new one */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first enqueue: open the initial window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5259
5260 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5261 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5262 classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5263 boolean_t flush, boolean_t *pdrop)
5264 {
5265 int error;
5266
5267 /* enqueue the packet (caller consumes object) */
5268 error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5269 cnt, bytes, pdrop);
5270
5271 /*
5272 * Tell the driver to start dequeueing; do this even when the queue
5273 * for the packet is suspended (EQSUSPENDED), as the driver could still
5274 * be dequeueing from other unsuspended queues.
5275 */
5276 if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5277 ifnet_start(ifp);
5278 }
5279 return error;
5280 }
5281
5282 #if DEVELOPMENT || DEBUG
5283 void
trace_pkt_dump_payload(struct ifnet * ifp,struct __kern_packet * kpkt,bool input)5284 trace_pkt_dump_payload(struct ifnet *ifp, struct __kern_packet *kpkt, bool input)
5285 {
5286 #define MIN_TRACE_DUMP_PKT_SIZE 32
5287 struct ether_header *eh = NULL;
5288 struct udphdr *uh = NULL;
5289
5290 if (__probable(kdebug_enable == 0 || (flow_key_trace.fk_ipver != IPVERSION &&
5291 flow_key_trace.fk_ipver != IPV6_VERSION))) {
5292 return;
5293 }
5294
5295 uint16_t bdlim, bdlen, bdoff;
5296 uint8_t *baddr;
5297
5298 MD_BUFLET_ADDR_ABS_DLEN(kpkt, baddr, bdlen, bdlim, bdoff);
5299
5300 if (!(kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED)) {
5301 if (!IFNET_IS_ETHERNET(ifp)) {
5302 return;
5303 }
5304
5305 sa_family_t af = AF_UNSPEC;
5306 ASSERT(kpkt->pkt_l2_len > 0);
5307
5308 baddr += kpkt->pkt_headroom;
5309 eh = (struct ether_header *)(void *)baddr;
5310 if (__improbable(sizeof(*eh) > kpkt->pkt_length)) {
5311 return;
5312 }
5313 if (__improbable(kpkt->pkt_headroom + sizeof(*eh) > bdlim)) {
5314 return;
5315 }
5316 uint16_t ether_type = ntohs(eh->ether_type);
5317 if (ether_type == ETHERTYPE_IP) {
5318 af = AF_INET;
5319 } else if (ether_type == ETHERTYPE_IPV6) {
5320 af = AF_INET6;
5321 } else {
5322 return;
5323 }
5324 flow_pkt_classify(kpkt, ifp, af, input);
5325 }
5326
5327 if (kpkt->pkt_flow_ip_ver != flow_key_trace.fk_ipver) {
5328 return;
5329 }
5330
5331 if (kpkt->pkt_flow_ip_proto != IPPROTO_UDP) {
5332 return;
5333 }
5334
5335 uint16_t sport = input ? flow_key_trace.fk_dport : flow_key_trace.fk_sport;
5336 uint16_t dport = input ? flow_key_trace.fk_sport : flow_key_trace.fk_dport;
5337
5338 if (kpkt->pkt_flow_udp_src != sport ||
5339 kpkt->pkt_flow_udp_dst != dport) {
5340 return;
5341 }
5342
5343 if (kpkt->pkt_flow_ip_ver == IPVERSION) {
5344 struct ip *ip_header = (struct ip *)kpkt->pkt_flow_ip_hdr;
5345 struct in_addr *saddr = input ? &flow_key_trace.fk_dst4 : &flow_key_trace.fk_src4;
5346 struct in_addr *daddr = input ? &flow_key_trace.fk_src4 : &flow_key_trace.fk_dst4;
5347
5348 if (ip_header->ip_src.s_addr != saddr->s_addr ||
5349 ip_header->ip_dst.s_addr != daddr->s_addr) {
5350 return;
5351 }
5352 } else if (kpkt->pkt_flow_ip_ver == IPV6_VERSION) {
5353 struct ip6_hdr *ip6_header = (struct ip6_hdr *)kpkt->pkt_flow_ip_hdr;
5354 struct in6_addr *saddr = input ? &flow_key_trace.fk_dst6 : &flow_key_trace.fk_src6;
5355 struct in6_addr *daddr = input ? &flow_key_trace.fk_src6 : &flow_key_trace.fk_dst6;
5356
5357 if (!IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_src, saddr) ||
5358 !IN6_ARE_ADDR_EQUAL(&ip6_header->ip6_dst, daddr)) {
5359 return;
5360 }
5361 }
5362
5363 int udp_payload_offset = kpkt->pkt_l2_len + kpkt->pkt_flow_ip_hlen + sizeof(struct udphdr);
5364
5365 uint16_t pkt_payload_len = bdlim - bdoff;
5366 pkt_payload_len = (uint16_t)MIN(pkt_payload_len, kpkt->pkt_length);
5367 pkt_payload_len -= udp_payload_offset;
5368
5369 if (pkt_payload_len >= MIN_TRACE_DUMP_PKT_SIZE) {
5370 uh = (struct udphdr *)kpkt->pkt_flow_udp_hdr;
5371 uint8_t *payload = (uint8_t *)(uh + 1);
5372
5373 /* Trace 32 bytes of UDP transport payload */
5374 uint64_t *trace1 = __DECONST(uint64_t *, payload);
5375 uint64_t *trace2 = trace1 + 1;
5376 uint64_t *trace3 = trace2 + 1;
5377 uint64_t *trace4 = trace3 + 1;
5378
5379 if (input) {
5380 KDBG(IFNET_KTRACE_RX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
5381 } else {
5382 KDBG(IFNET_KTRACE_TX_PKT_DUMP, *trace1, *trace2, *trace3, *trace4);
5383 }
5384 }
5385 }
5386 #endif /* DEVELOPMENT || DEBUG */
5387
5388 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5389 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5390 {
5391 struct ifnet *ifp = handle;
5392 boolean_t pdrop; /* dummy */
5393 uint32_t i;
5394
5395 ASSERT(n_pkts >= 1);
5396 for (i = 0; i < n_pkts - 1; i++) {
5397 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5398 FALSE, &pdrop);
5399 }
5400 /* flush with the last packet */
5401 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5402 TRUE, &pdrop);
5403
5404 return 0;
5405 }
5406
5407 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5408 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5409 classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5410 {
5411 #if DEVELOPMENT || DEBUG
5412 switch (pkt->cp_ptype) {
5413 case QP_PACKET: {
5414 trace_pkt_dump_payload(ifp, pkt->cp_kpkt, false);
5415 break;
5416 }
5417 case QP_MBUF:
5418 case QP_INVALID: {
5419 break;
5420 }
5421 }
5422 #endif /* DEVELOPMENT || DEBUG */
5423
5424 if (ifp->if_output_netem != NULL) {
5425 bool drop;
5426 errno_t error;
5427 error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5428 *pdrop = drop ? TRUE : FALSE;
5429 return error;
5430 } else {
5431 return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5432 }
5433 }
5434
5435 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5436 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5437 {
5438 boolean_t pdrop;
5439 return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5440 }
5441
5442 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5443 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5444 boolean_t *pdrop)
5445 {
5446 classq_pkt_t pkt;
5447
5448 if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5449 m->m_nextpkt != NULL) {
5450 if (m != NULL) {
5451 m_freem_list(m);
5452 *pdrop = TRUE;
5453 }
5454 return EINVAL;
5455 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5456 !IF_FULLY_ATTACHED(ifp)) {
5457 /* flag tested without lock for performance */
5458 m_freem(m);
5459 *pdrop = TRUE;
5460 return ENXIO;
5461 } else if (!(ifp->if_flags & IFF_UP)) {
5462 m_freem(m);
5463 *pdrop = TRUE;
5464 return ENETDOWN;
5465 }
5466
5467 CLASSQ_PKT_INIT_MBUF(&pkt, m);
5468 return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5469 }
5470
5471 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5472 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5473 struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5474 boolean_t *pdrop)
5475 {
5476 classq_pkt_t head, tail;
5477
5478 ASSERT(m_head != NULL);
5479 ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5480 ASSERT(m_tail != NULL);
5481 ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5482 ASSERT(ifp != NULL);
5483 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5484
5485 if (!IF_FULLY_ATTACHED(ifp)) {
5486 /* flag tested without lock for performance */
5487 m_freem_list(m_head);
5488 *pdrop = TRUE;
5489 return ENXIO;
5490 } else if (!(ifp->if_flags & IFF_UP)) {
5491 m_freem_list(m_head);
5492 *pdrop = TRUE;
5493 return ENETDOWN;
5494 }
5495
5496 CLASSQ_PKT_INIT_MBUF(&head, m_head);
5497 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5498 return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5499 flush, pdrop);
5500 }
5501
5502 #if SKYWALK
5503 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5504 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5505 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5506 {
5507 classq_pkt_t pkt;
5508
5509 ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5510
5511 if (__improbable(ifp == NULL || kpkt == NULL)) {
5512 if (kpkt != NULL) {
5513 pp_free_packet(__DECONST(struct kern_pbufpool *,
5514 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5515 *pdrop = TRUE;
5516 }
5517 return EINVAL;
5518 } else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5519 !IF_FULLY_ATTACHED(ifp))) {
5520 /* flag tested without lock for performance */
5521 pp_free_packet(__DECONST(struct kern_pbufpool *,
5522 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5523 *pdrop = TRUE;
5524 return ENXIO;
5525 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5526 pp_free_packet(__DECONST(struct kern_pbufpool *,
5527 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5528 *pdrop = TRUE;
5529 return ENETDOWN;
5530 }
5531
5532 CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5533 return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5534 }
5535
5536 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5537 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5538 boolean_t flush, boolean_t *pdrop)
5539 {
5540 return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5541 }
5542
5543 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5544 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5545 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5546 {
5547 return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5548 }
5549
5550 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5551 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5552 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5553 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5554 {
5555 classq_pkt_t head, tail;
5556
5557 ASSERT(k_head != NULL);
5558 ASSERT(k_tail != NULL);
5559 ASSERT(ifp != NULL);
5560 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5561
5562 if (!IF_FULLY_ATTACHED(ifp)) {
5563 /* flag tested without lock for performance */
5564 pp_free_packet_chain(k_head, NULL);
5565 *pdrop = TRUE;
5566 return ENXIO;
5567 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5568 pp_free_packet_chain(k_head, NULL);
5569 *pdrop = TRUE;
5570 return ENETDOWN;
5571 }
5572
5573 CLASSQ_PKT_INIT_PACKET(&head, k_head);
5574 CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5575 return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5576 flush, pdrop);
5577 }
5578
5579 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5580 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5581 struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5582 boolean_t *pdrop)
5583 {
5584 return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5585 cnt, bytes, flush, pdrop);
5586 }
5587
5588 errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5589 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5590 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5591 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5592 {
5593 return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5594 cnt, bytes, flush, pdrop);
5595 }
5596 #endif /* SKYWALK */
5597
5598 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5599 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5600 {
5601 errno_t rc;
5602 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5603
5604 if (ifp == NULL || mp == NULL) {
5605 return EINVAL;
5606 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5607 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5608 return ENXIO;
5609 }
5610 if (!ifnet_is_attached(ifp, 1)) {
5611 return ENXIO;
5612 }
5613
5614 #if SKYWALK
5615 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5616 #endif /* SKYWALK */
5617 rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5618 &pkt, NULL, NULL, NULL, 0);
5619 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5620 ifnet_decr_iorefcnt(ifp);
5621 *mp = pkt.cp_mbuf;
5622 return rc;
5623 }
5624
5625 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5626 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5627 struct mbuf **mp)
5628 {
5629 errno_t rc;
5630 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5631
5632 if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5633 return EINVAL;
5634 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5635 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5636 return ENXIO;
5637 }
5638 if (!ifnet_is_attached(ifp, 1)) {
5639 return ENXIO;
5640 }
5641
5642 #if SKYWALK
5643 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5644 #endif /* SKYWALK */
5645 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5646 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5647 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5648 ifnet_decr_iorefcnt(ifp);
5649 *mp = pkt.cp_mbuf;
5650 return rc;
5651 }
5652
5653 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5654 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5655 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5656 {
5657 errno_t rc;
5658 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5659 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5660
5661 if (ifp == NULL || head == NULL || pkt_limit < 1) {
5662 return EINVAL;
5663 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5664 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5665 return ENXIO;
5666 }
5667 if (!ifnet_is_attached(ifp, 1)) {
5668 return ENXIO;
5669 }
5670
5671 #if SKYWALK
5672 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5673 #endif /* SKYWALK */
5674 rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5675 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5676 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5677 ifnet_decr_iorefcnt(ifp);
5678 *head = pkt_head.cp_mbuf;
5679 if (tail != NULL) {
5680 *tail = pkt_tail.cp_mbuf;
5681 }
5682 return rc;
5683 }
5684
5685 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5686 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5687 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5688 {
5689 errno_t rc;
5690 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5691 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5692
5693 if (ifp == NULL || head == NULL || byte_limit < 1) {
5694 return EINVAL;
5695 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5696 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5697 return ENXIO;
5698 }
5699 if (!ifnet_is_attached(ifp, 1)) {
5700 return ENXIO;
5701 }
5702
5703 #if SKYWALK
5704 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5705 #endif /* SKYWALK */
5706 rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5707 byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5708 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5709 ifnet_decr_iorefcnt(ifp);
5710 *head = pkt_head.cp_mbuf;
5711 if (tail != NULL) {
5712 *tail = pkt_tail.cp_mbuf;
5713 }
5714 return rc;
5715 }
5716
5717 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5718 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5719 u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5720 u_int32_t *len)
5721 {
5722 errno_t rc;
5723 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5724 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5725
5726 if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5727 !MBUF_VALID_SC(sc)) {
5728 return EINVAL;
5729 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5730 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5731 return ENXIO;
5732 }
5733 if (!ifnet_is_attached(ifp, 1)) {
5734 return ENXIO;
5735 }
5736
5737 #if SKYWALK
5738 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5739 #endif /* SKYWALK */
5740 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
5741 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
5742 cnt, len, 0);
5743 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5744 ifnet_decr_iorefcnt(ifp);
5745 *head = pkt_head.cp_mbuf;
5746 if (tail != NULL) {
5747 *tail = pkt_tail.cp_mbuf;
5748 }
5749 return rc;
5750 }
5751
5752 #if XNU_TARGET_OS_OSX
5753 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)5754 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
5755 const struct sockaddr *dest, const char *dest_linkaddr,
5756 const char *frame_type, u_int32_t *pre, u_int32_t *post)
5757 {
5758 if (pre != NULL) {
5759 *pre = 0;
5760 }
5761 if (post != NULL) {
5762 *post = 0;
5763 }
5764
5765 return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
5766 }
5767 #endif /* XNU_TARGET_OS_OSX */
5768
5769 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5770 packet_has_vlan_tag(struct mbuf * m)
5771 {
5772 u_int tag = 0;
5773
5774 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5775 tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5776 if (tag == 0) {
5777 /* the packet is just priority-tagged, clear the bit */
5778 m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5779 }
5780 }
5781 return tag != 0;
5782 }
5783
/*
 * dlil_interface_filters_input
 *
 * Run an inbound packet through every filter attached to the interface.
 * The filter list lock must be dropped around each filter callout; the
 * if_flt_monitor_busy() marker taken under the lock keeps the list from
 * being torn down while the lock is released.
 *
 * A filter may consume or replace the packet: *m_p and *frame_header_p
 * are in/out. Returns 0 on success; a non-zero filter result (including
 * EJUSTRETURN when the filter took ownership of the packet) stops the
 * walk and is returned to the caller.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet = FALSE;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	/* note: may clear CSUM_VLAN_TAG_VALID for priority-tagged packets */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callout; busy marker
			 * above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5844
/*
 * dlil_interface_filters_output
 *
 * Run an outbound packet through every filter attached to the interface.
 * Mirrors dlil_interface_filters_input(): the filter list lock is dropped
 * around each callout while if_flt_monitor_busy() pins the list.
 *
 * Returns 0 on success; a non-zero filter result (including EJUSTRETURN
 * when the filter consumed the packet) stops the walk and is returned.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	/* note: may clear CSUM_VLAN_TAG_VALID for priority-tagged packets */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callout; busy marker
			 * above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5894
5895 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)5896 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
5897 {
5898 int error;
5899
5900 if (ifproto->proto_kpi == kProtoKPI_v1) {
5901 /* Version 1 protocols get one packet at a time */
5902 while (m != NULL) {
5903 char * frame_header;
5904 mbuf_t next_packet;
5905
5906 next_packet = m->m_nextpkt;
5907 m->m_nextpkt = NULL;
5908 frame_header = m->m_pkthdr.pkt_hdr;
5909 m->m_pkthdr.pkt_hdr = NULL;
5910 error = (*ifproto->kpi.v1.input)(ifproto->ifp,
5911 ifproto->protocol_family, m, frame_header);
5912 if (error != 0 && error != EJUSTRETURN) {
5913 m_freem(m);
5914 }
5915 m = next_packet;
5916 }
5917 } else if (ifproto->proto_kpi == kProtoKPI_v2) {
5918 /* Version 2 protocols support packet lists */
5919 error = (*ifproto->kpi.v2.input)(ifproto->ifp,
5920 ifproto->protocol_family, m);
5921 if (error != 0 && error != EJUSTRETURN) {
5922 m_freem_list(m);
5923 }
5924 }
5925 }
5926
5927 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5928 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5929 struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5930 {
5931 struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5932
5933 if (s->packets_in != 0) {
5934 d->packets_in += s->packets_in;
5935 }
5936 if (s->bytes_in != 0) {
5937 d->bytes_in += s->bytes_in;
5938 }
5939 if (s->errors_in != 0) {
5940 d->errors_in += s->errors_in;
5941 }
5942
5943 if (s->packets_out != 0) {
5944 d->packets_out += s->packets_out;
5945 }
5946 if (s->bytes_out != 0) {
5947 d->bytes_out += s->bytes_out;
5948 }
5949 if (s->errors_out != 0) {
5950 d->errors_out += s->errors_out;
5951 }
5952
5953 if (s->collisions != 0) {
5954 d->collisions += s->collisions;
5955 }
5956 if (s->dropped != 0) {
5957 d->dropped += s->dropped;
5958 }
5959
5960 if (poll) {
5961 PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5962 }
5963 }
5964
/*
 * dlil_input_stats_sync
 *
 * Flush the input thread's locally accumulated counters into the
 * interface's global stats, zeroing each local field as it is drained.
 * Returns TRUE when the interface has a non-zero data threshold set
 * (presumably so the caller can decide whether to post a threshold
 * notification -- confirm against callers).
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
6024
6025 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)6026 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6027 {
6028 return dlil_input_packet_list_common(ifp, m, 0,
6029 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6030 }
6031
6032 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)6033 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6034 u_int32_t cnt, ifnet_model_t mode)
6035 {
6036 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6037 }
6038
/*
 * dlil_input_packet_list_common
 *
 * Core DLIL receive path. Walks the packet chain, and for each packet:
 * takes an IO (data-mov) reference on its receive interface (skipped
 * for lo0, which never detaches), demuxes it to a protocol family,
 * optionally performs CLAT46/64 translation, adjusts partial-checksum
 * offload offsets, runs the interface filters, then batches packets
 * destined for the same protocol and delivers each batch via
 * dlil_ifproto_input().
 *
 * ifp_param may be NULL, in which case each packet's rcvif is used
 * (packets in the chain may then belong to different interfaces).
 * `cnt` and `mode` are only meaningful when `ext` is TRUE (extended
 * entry point) and drive opportunistic polling.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;
	mbuf_t *pkt_next = NULL;
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;             /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		/* poll the driver every poll_ival packets while in poll mode */
		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			/* EJUSTRETURN: demux consumed the packet */
			if (error == EJUSTRETURN) {
				goto next;
			}
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			/* dlil_clat64 may replace m and rewrite protocol_family */
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceeding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has been
		 * been queried by the driver before the device went to sleep
		 */
		/* NOTE(review): only non-IP families are matched here --
		 * presumably IP/IPv6 wake packets are matched later in the
		 * inet input path; confirm before changing. */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			/* invalidate the hw checksum if the header span
			 * is implausible rather than mis-adjust it */
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			atomic_add_64(&ifp->if_imcasts, 1);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same proto as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6371
6372 errno_t
if_mcasts_update(struct ifnet * ifp)6373 if_mcasts_update(struct ifnet *ifp)
6374 {
6375 errno_t err;
6376
6377 err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6378 if (err == EAFNOSUPPORT) {
6379 err = 0;
6380 }
6381 DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6382 "(err=%d)\n", if_name(ifp),
6383 (err == 0 ? "successfully restored" : "failed to restore"),
6384 ifp->if_updatemcasts, err);
6385
6386 /* just return success */
6387 return 0;
6388 }
6389
6390 /* If ifp is set, we will increment the generation for the interface */
6391 int
dlil_post_complete_msg(struct ifnet * ifp,struct kev_msg * event)6392 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6393 {
6394 if (ifp != NULL) {
6395 ifnet_increment_generation(ifp);
6396 }
6397
6398 #if NECP
6399 necp_update_all_clients();
6400 #endif /* NECP */
6401
6402 return kev_post_msg(event);
6403 }
6404
6405 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6406 dlil_post_sifflags_msg(struct ifnet * ifp)
6407 {
6408 struct kev_msg ev_msg;
6409 struct net_event_data ev_data;
6410
6411 bzero(&ev_data, sizeof(ev_data));
6412 bzero(&ev_msg, sizeof(ev_msg));
6413 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6414 ev_msg.kev_class = KEV_NETWORK_CLASS;
6415 ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6416 ev_msg.event_code = KEV_DL_SIFFLAGS;
6417 strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6418 ev_data.if_family = ifp->if_family;
6419 ev_data.if_unit = (u_int32_t) ifp->if_unit;
6420 ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6421 ev_msg.dv[0].data_ptr = &ev_data;
6422 ev_msg.dv[1].data_length = 0;
6423 dlil_post_complete_msg(ifp, &ev_msg);
6424 }
6425
/* stack-allocated snapshot capacity before falling back to the heap */
#define TMP_IF_PROTO_ARR_SIZE 10
/*
 * dlil_event_internal
 *
 * Deliver a kernel event, in order, to: every interface filter, every
 * protocol attached to the interface, and the interface's own event
 * callback; finally post the event message (bumping the ifnet
 * generation when update_generation is set). Attached protocols are
 * snapshotted into a ref-held array under the ifnet lock so their
 * event callbacks can be invoked without holding any lock.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock across the callout; busy marker
			 * above keeps the list stable meanwhile */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* take a ref on each proto so it survives the unlock below */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* deliver the event to each snapshotted protocol, dropping refs */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6526
6527 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6528 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6529 {
6530 struct kev_msg kev_msg;
6531 int result = 0;
6532
6533 if (ifp == NULL || event == NULL) {
6534 return EINVAL;
6535 }
6536
6537 bzero(&kev_msg, sizeof(kev_msg));
6538 kev_msg.vendor_code = event->vendor_code;
6539 kev_msg.kev_class = event->kev_class;
6540 kev_msg.kev_subclass = event->kev_subclass;
6541 kev_msg.event_code = event->event_code;
6542 kev_msg.dv[0].data_ptr = &event->event_data[0];
6543 kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6544 kev_msg.dv[1].data_length = 0;
6545
6546 result = dlil_event_internal(ifp, &kev_msg, TRUE);
6547
6548 return result;
6549 }
6550
6551 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6552 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6553 {
6554 mbuf_t n = m;
6555 int chainlen = 0;
6556
6557 while (n != NULL) {
6558 chainlen++;
6559 n = n->m_next;
6560 }
6561 switch (chainlen) {
6562 case 0:
6563 break;
6564 case 1:
6565 atomic_add_64(&cls->cls_one, 1);
6566 break;
6567 case 2:
6568 atomic_add_64(&cls->cls_two, 1);
6569 break;
6570 case 3:
6571 atomic_add_64(&cls->cls_three, 1);
6572 break;
6573 case 4:
6574 atomic_add_64(&cls->cls_four, 1);
6575 break;
6576 case 5:
6577 default:
6578 atomic_add_64(&cls->cls_five_or_more, 1);
6579 break;
6580 }
6581 }
6582
6583 #if CONFIG_DTRACE
6584 __attribute__((noinline))
6585 static void
dlil_output_dtrace(ifnet_t ifp,protocol_family_t proto_family,mbuf_t m)6586 dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
6587 {
6588 if (proto_family == PF_INET) {
6589 struct ip *ip = mtod(m, struct ip *);
6590 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
6591 struct ip *, ip, struct ifnet *, ifp,
6592 struct ip *, ip, struct ip6_hdr *, NULL);
6593 } else if (proto_family == PF_INET6) {
6594 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
6595 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
6596 struct ip6_hdr *, ip6, struct ifnet *, ifp,
6597 struct ip *, NULL, struct ip6_hdr *, ip6);
6598 }
6599 }
6600 #endif /* CONFIG_DTRACE */
6601
6602 /*
6603 * dlil_output
6604 *
6605 * Caller should have a lock on the protocol domain if the protocol
6606 * doesn't support finer grained locking. In most cases, the lock
6607 * will be held from the socket layer and won't be released until
6608 * we return back to the socket layer.
6609 *
6610 * This does mean that we must take a protocol lock before we take
6611 * an interface lock if we're going to take both. This makes sense
6612 * because a protocol is likely to interact with an ifp while it
6613 * is under the protocol lock.
6614 *
6615 * An advisory code will be returned if adv is not null. This
6616 * can be used to provide feedback about interface queues to the
6617 * application.
6618 */
6619 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int raw,struct flowadv * adv)6620 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6621 void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
6622 {
6623 char *frame_type = NULL;
6624 char *dst_linkaddr = NULL;
6625 int retval = 0;
6626 char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
6627 char dst_linkaddr_buffer[MAX_LINKADDR * 4];
6628 struct if_proto *proto = NULL;
6629 mbuf_t m = NULL;
6630 mbuf_t send_head = NULL;
6631 mbuf_t *send_tail = &send_head;
6632 int iorefcnt = 0;
6633 u_int32_t pre = 0, post = 0;
6634 u_int32_t fpkts = 0, fbytes = 0;
6635 int32_t flen = 0;
6636 struct timespec now;
6637 u_int64_t now_nsec;
6638 boolean_t did_clat46 = FALSE;
6639 protocol_family_t old_proto_family = proto_family;
6640 struct sockaddr_in6 dest6;
6641 struct rtentry *rt = NULL;
6642 u_int32_t m_loop_set = 0;
6643
6644 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6645
6646 /*
6647 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6648 * from happening while this operation is in progress
6649 */
6650 if (!ifnet_datamov_begin(ifp)) {
6651 retval = ENXIO;
6652 goto cleanup;
6653 }
6654 iorefcnt = 1;
6655
6656 VERIFY(ifp->if_output_dlil != NULL);
6657
6658 /* update the driver's multicast filter, if needed */
6659 if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6660 ifp->if_updatemcasts = 0;
6661 }
6662
6663 frame_type = frame_type_buffer;
6664 dst_linkaddr = dst_linkaddr_buffer;
6665
6666 if (raw == 0) {
6667 ifnet_lock_shared(ifp);
6668 /* callee holds a proto refcnt upon success */
6669 proto = find_attached_proto(ifp, proto_family);
6670 if (proto == NULL) {
6671 ifnet_lock_done(ifp);
6672 retval = ENXIO;
6673 goto cleanup;
6674 }
6675 ifnet_lock_done(ifp);
6676 }
6677
6678 preout_again:
6679 if (packetlist == NULL) {
6680 goto cleanup;
6681 }
6682
6683 m = packetlist;
6684 packetlist = packetlist->m_nextpkt;
6685 m->m_nextpkt = NULL;
6686
6687 m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6688
6689 /*
6690 * Perform address family translation for the first
6691 * packet outside the loop in order to perform address
6692 * lookup for the translated proto family.
6693 */
6694 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6695 (ifp->if_type == IFT_CELLULAR ||
6696 dlil_is_clat_needed(proto_family, m))) {
6697 retval = dlil_clat46(ifp, &proto_family, &m);
6698 /*
6699 * Go to the next packet if translation fails
6700 */
6701 if (retval != 0) {
6702 m_freem(m);
6703 m = NULL;
6704 ip6stat.ip6s_clat464_out_drop++;
6705 /* Make sure that the proto family is PF_INET */
6706 ASSERT(proto_family == PF_INET);
6707 goto preout_again;
6708 }
6709 /*
6710 * Free the old one and make it point to the IPv6 proto structure.
6711 *
6712 * Change proto for the first time we have successfully
6713 * performed address family translation.
6714 */
6715 if (!did_clat46 && proto_family == PF_INET6) {
6716 did_clat46 = TRUE;
6717
6718 if (proto != NULL) {
6719 if_proto_free(proto);
6720 }
6721 ifnet_lock_shared(ifp);
6722 /* callee holds a proto refcnt upon success */
6723 proto = find_attached_proto(ifp, proto_family);
6724 if (proto == NULL) {
6725 ifnet_lock_done(ifp);
6726 retval = ENXIO;
6727 m_freem(m);
6728 m = NULL;
6729 goto cleanup;
6730 }
6731 ifnet_lock_done(ifp);
6732 if (ifp->if_type == IFT_ETHER) {
6733 /* Update the dest to translated v6 address */
6734 dest6.sin6_len = sizeof(struct sockaddr_in6);
6735 dest6.sin6_family = AF_INET6;
6736 dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
6737 dest = (const struct sockaddr *)&dest6;
6738
6739 /*
6740 * Lookup route to the translated destination
6741 * Free this route ref during cleanup
6742 */
6743 rt = rtalloc1_scoped((struct sockaddr *)&dest6,
6744 0, 0, ifp->if_index);
6745
6746 route = rt;
6747 }
6748 }
6749 }
6750
6751 /*
6752 * This path gets packet chain going to the same destination.
6753 * The pre output routine is used to either trigger resolution of
6754 * the next hop or retreive the next hop's link layer addressing.
6755 * For ex: ether_inet(6)_pre_output routine.
6756 *
6757 * If the routine returns EJUSTRETURN, it implies that packet has
6758 * been queued, and therefore we have to call preout_again for the
6759 * following packet in the chain.
6760 *
6761 * For errors other than EJUSTRETURN, the current packet is freed
6762 * and the rest of the chain (pointed by packetlist is freed as
6763 * part of clean up.
6764 *
6765 * Else if there is no error the retrieved information is used for
6766 * all the packets in the chain.
6767 */
6768 if (raw == 0) {
6769 proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
6770 proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
6771 retval = 0;
6772 if (preoutp != NULL) {
6773 retval = preoutp(ifp, proto_family, &m, dest, route,
6774 frame_type, dst_linkaddr);
6775
6776 if (retval != 0) {
6777 if (retval == EJUSTRETURN) {
6778 goto preout_again;
6779 }
6780 m_freem(m);
6781 m = NULL;
6782 goto cleanup;
6783 }
6784 }
6785 }
6786
6787 do {
6788 /*
6789 * pkt_hdr is set here to point to m_data prior to
6790 * calling into the framer. This value of pkt_hdr is
6791 * used by the netif gso logic to retrieve the ip header
6792 * for the TCP packets, offloaded for TSO processing.
6793 */
6794 if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
6795 uint8_t vlan_encap_len = 0;
6796
6797 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
6798 vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
6799 }
6800 m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
6801 } else {
6802 m->m_pkthdr.pkt_hdr = mtod(m, void *);
6803 }
6804
6805 /*
6806 * Perform address family translation if needed.
6807 * For now we only support stateless 4 to 6 translation
6808 * on the out path.
6809 *
6810 * The routine below translates IP header, updates protocol
6811 * checksum and also translates ICMP.
6812 *
6813 * We skip the first packet as it is already translated and
6814 * the proto family is set to PF_INET6.
6815 */
6816 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6817 (ifp->if_type == IFT_CELLULAR ||
6818 dlil_is_clat_needed(proto_family, m))) {
6819 retval = dlil_clat46(ifp, &proto_family, &m);
6820 /* Goto the next packet if the translation fails */
6821 if (retval != 0) {
6822 m_freem(m);
6823 m = NULL;
6824 ip6stat.ip6s_clat464_out_drop++;
6825 goto next;
6826 }
6827 }
6828
6829 #if CONFIG_DTRACE
6830 if (!raw) {
6831 dlil_output_dtrace(ifp, proto_family, m);
6832 }
6833 #endif /* CONFIG_DTRACE */
6834
6835 if (raw == 0 && ifp->if_framer != NULL) {
6836 int rcvif_set = 0;
6837
6838 /*
6839 * If this is a broadcast packet that needs to be
6840 * looped back into the system, set the inbound ifp
6841 * to that of the outbound ifp. This will allow
6842 * us to determine that it is a legitimate packet
6843 * for the system. Only set the ifp if it's not
6844 * already set, just to be safe.
6845 */
6846 if ((m->m_flags & (M_BCAST | M_LOOP)) &&
6847 m->m_pkthdr.rcvif == NULL) {
6848 m->m_pkthdr.rcvif = ifp;
6849 rcvif_set = 1;
6850 }
6851 m_loop_set = m->m_flags & M_LOOP;
6852 retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
6853 frame_type, &pre, &post);
6854 if (retval != 0) {
6855 if (retval != EJUSTRETURN) {
6856 m_freem(m);
6857 }
6858 goto next;
6859 }
6860
6861 /*
6862 * For partial checksum offload, adjust the start
6863 * and stuff offsets based on the prepended header.
6864 */
6865 if ((m->m_pkthdr.csum_flags &
6866 (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6867 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6868 m->m_pkthdr.csum_tx_stuff += pre;
6869 m->m_pkthdr.csum_tx_start += pre;
6870 }
6871
6872 if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
6873 dlil_output_cksum_dbg(ifp, m, pre,
6874 proto_family);
6875 }
6876
6877 /*
6878 * Clear the ifp if it was set above, and to be
6879 * safe, only if it is still the same as the
6880 * outbound ifp we have in context. If it was
6881 * looped back, then a copy of it was sent to the
6882 * loopback interface with the rcvif set, and we
6883 * are clearing the one that will go down to the
6884 * layer below.
6885 */
6886 if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
6887 m->m_pkthdr.rcvif = NULL;
6888 }
6889 }
6890
6891 /*
6892 * Let interface filters (if any) do their thing ...
6893 */
6894 retval = dlil_interface_filters_output(ifp, &m, proto_family);
6895 if (retval != 0) {
6896 if (retval != EJUSTRETURN) {
6897 m_freem(m);
6898 }
6899 goto next;
6900 }
6901 /*
6902 * Strip away M_PROTO1 bit prior to sending packet
6903 * to the driver as this field may be used by the driver
6904 */
6905 m->m_flags &= ~M_PROTO1;
6906
6907 /*
6908 * If the underlying interface is not capable of handling a
6909 * packet whose data portion spans across physically disjoint
6910 * pages, we need to "normalize" the packet so that we pass
6911 * down a chain of mbufs where each mbuf points to a span that
6912 * resides in the system page boundary. If the packet does
6913 * not cross page(s), the following is a no-op.
6914 */
6915 if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
6916 if ((m = m_normalize(m)) == NULL) {
6917 goto next;
6918 }
6919 }
6920
6921 /*
6922 * If this is a TSO packet, make sure the interface still
6923 * advertise TSO capability.
6924 */
6925 if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
6926 retval = EMSGSIZE;
6927 m_freem(m);
6928 goto cleanup;
6929 }
6930
6931 ifp_inc_traffic_class_out(ifp, m);
6932
6933 #if SKYWALK
6934 /*
6935 * For native skywalk devices, packets will be passed to pktap
6936 * after GSO or after the mbuf to packet conversion.
6937 * This is done for IPv4/IPv6 packets only because there is no
6938 * space in the mbuf to pass down the proto family.
6939 */
6940 if (dlil_is_native_netif_nexus(ifp)) {
6941 if (raw || m->m_pkthdr.pkt_proto == 0) {
6942 pktap_output(ifp, proto_family, m, pre, post);
6943 m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
6944 }
6945 } else {
6946 pktap_output(ifp, proto_family, m, pre, post);
6947 }
6948 #else /* SKYWALK */
6949 pktap_output(ifp, proto_family, m, pre, post);
6950 #endif /* SKYWALK */
6951
6952 /*
6953 * Count the number of elements in the mbuf chain
6954 */
6955 if (tx_chain_len_count) {
6956 dlil_count_chain_len(m, &tx_chain_len_stats);
6957 }
6958
6959 /*
6960 * Record timestamp; ifnet_enqueue() will use this info
6961 * rather than redoing the work. An optimization could
6962 * involve doing this just once at the top, if there are
6963 * no interface filters attached, but that's probably
6964 * not a big deal.
6965 */
6966 nanouptime(&now);
6967 net_timernsec(&now, &now_nsec);
6968 (void) mbuf_set_timestamp(m, now_nsec, TRUE);
6969
6970 /*
6971 * Discard partial sum information if this packet originated
6972 * from another interface; the packet would already have the
6973 * final checksum and we shouldn't recompute it.
6974 */
6975 if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
6976 (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6977 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6978 m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
6979 m->m_pkthdr.csum_data = 0;
6980 }
6981
6982 /*
6983 * Finally, call the driver.
6984 */
6985 if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
6986 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6987 flen += (m_pktlen(m) - (pre + post));
6988 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6989 }
6990 *send_tail = m;
6991 send_tail = &m->m_nextpkt;
6992 } else {
6993 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6994 flen = (m_pktlen(m) - (pre + post));
6995 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6996 } else {
6997 flen = 0;
6998 }
6999 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7000 0, 0, 0, 0, 0);
7001 retval = (*ifp->if_output_dlil)(ifp, m);
7002 if (retval == EQFULL || retval == EQSUSPENDED) {
7003 if (adv != NULL && adv->code == FADV_SUCCESS) {
7004 adv->code = (retval == EQFULL ?
7005 FADV_FLOW_CONTROLLED :
7006 FADV_SUSPENDED);
7007 }
7008 retval = 0;
7009 }
7010 if (retval == 0 && flen > 0) {
7011 fbytes += flen;
7012 fpkts++;
7013 }
7014 if (retval != 0 && dlil_verbose) {
7015 DLIL_PRINTF("%s: output error on %s retval = %d\n",
7016 __func__, if_name(ifp),
7017 retval);
7018 }
7019 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
7020 0, 0, 0, 0, 0);
7021 }
7022 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7023
7024 next:
7025 m = packetlist;
7026 if (m != NULL) {
7027 m->m_flags |= m_loop_set;
7028 packetlist = packetlist->m_nextpkt;
7029 m->m_nextpkt = NULL;
7030 }
7031 /* Reset the proto family to old proto family for CLAT */
7032 if (did_clat46) {
7033 proto_family = old_proto_family;
7034 }
7035 } while (m != NULL);
7036
7037 if (send_head != NULL) {
7038 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7039 0, 0, 0, 0, 0);
7040 if (ifp->if_eflags & IFEF_SENDLIST) {
7041 retval = (*ifp->if_output_dlil)(ifp, send_head);
7042 if (retval == EQFULL || retval == EQSUSPENDED) {
7043 if (adv != NULL) {
7044 adv->code = (retval == EQFULL ?
7045 FADV_FLOW_CONTROLLED :
7046 FADV_SUSPENDED);
7047 }
7048 retval = 0;
7049 }
7050 if (retval == 0 && flen > 0) {
7051 fbytes += flen;
7052 fpkts++;
7053 }
7054 if (retval != 0 && dlil_verbose) {
7055 DLIL_PRINTF("%s: output error on %s retval = %d\n",
7056 __func__, if_name(ifp), retval);
7057 }
7058 } else {
7059 struct mbuf *send_m;
7060 int enq_cnt = 0;
7061 VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
7062 while (send_head != NULL) {
7063 send_m = send_head;
7064 send_head = send_m->m_nextpkt;
7065 send_m->m_nextpkt = NULL;
7066 retval = (*ifp->if_output_dlil)(ifp, send_m);
7067 if (retval == EQFULL || retval == EQSUSPENDED) {
7068 if (adv != NULL) {
7069 adv->code = (retval == EQFULL ?
7070 FADV_FLOW_CONTROLLED :
7071 FADV_SUSPENDED);
7072 }
7073 retval = 0;
7074 }
7075 if (retval == 0) {
7076 enq_cnt++;
7077 if (flen > 0) {
7078 fpkts++;
7079 }
7080 }
7081 if (retval != 0 && dlil_verbose) {
7082 DLIL_PRINTF("%s: output error on %s "
7083 "retval = %d\n",
7084 __func__, if_name(ifp), retval);
7085 }
7086 }
7087 if (enq_cnt > 0) {
7088 fbytes += flen;
7089 ifnet_start(ifp);
7090 }
7091 }
7092 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7093 }
7094
7095 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7096
7097 cleanup:
7098 if (fbytes > 0) {
7099 ifp->if_fbytes += fbytes;
7100 }
7101 if (fpkts > 0) {
7102 ifp->if_fpackets += fpkts;
7103 }
7104 if (proto != NULL) {
7105 if_proto_free(proto);
7106 }
7107 if (packetlist) { /* if any packets are left, clean up */
7108 mbuf_freem_list(packetlist);
7109 }
7110 if (retval == EJUSTRETURN) {
7111 retval = 0;
7112 }
7113 if (iorefcnt == 1) {
7114 ifnet_datamov_end(ifp);
7115 }
7116 if (rt != NULL) {
7117 rtfree(rt);
7118 rt = NULL;
7119 }
7120
7121 return retval;
7122 }
7123
7124 /*
7125 * This routine checks if the destination address is not a loopback, link-local,
7126 * multicast or broadcast address.
7127 */
7128 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7129 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7130 {
7131 int ret = 0;
7132 switch (proto_family) {
7133 case PF_INET: {
7134 struct ip *iph = mtod(m, struct ip *);
7135 if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7136 ret = 1;
7137 }
7138 break;
7139 }
7140 case PF_INET6: {
7141 struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7142 if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7143 CLAT64_NEEDED(&ip6h->ip6_dst)) {
7144 ret = 1;
7145 }
7146 break;
7147 }
7148 }
7149
7150 return ret;
7151 }
7152 /*
7153 * @brief This routine translates IPv4 packet to IPv6 packet,
7154 * updates protocol checksum and also translates ICMP for code
7155 * along with inner header translation.
7156 *
7157 * @param ifp Pointer to the interface
7158 * @param proto_family pointer to protocol family. It is updated if function
7159 * performs the translation successfully.
7160 * @param m Pointer to the pointer pointing to the packet. Needed because this
7161 * routine can end up changing the mbuf to a different one.
7162 *
7163 * @return 0 on success or else a negative value.
7164 */
7165 static errno_t
dlil_clat46(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7166 dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7167 {
7168 VERIFY(*proto_family == PF_INET);
7169 VERIFY(IS_INTF_CLAT46(ifp));
7170
7171 pbuf_t pbuf_store, *pbuf = NULL;
7172 struct ip *iph = NULL;
7173 struct in_addr osrc, odst;
7174 uint8_t proto = 0;
7175 struct in6_ifaddr *ia6_clat_src = NULL;
7176 struct in6_addr *src = NULL;
7177 struct in6_addr dst;
7178 int error = 0;
7179 uint16_t off = 0;
7180 uint16_t tot_len = 0;
7181 uint16_t ip_id_val = 0;
7182 uint16_t ip_frag_off = 0;
7183
7184 boolean_t is_frag = FALSE;
7185 boolean_t is_first_frag = TRUE;
7186 boolean_t is_last_frag = TRUE;
7187
7188 pbuf_init_mbuf(&pbuf_store, *m, ifp);
7189 pbuf = &pbuf_store;
7190 iph = pbuf->pb_data;
7191
7192 osrc = iph->ip_src;
7193 odst = iph->ip_dst;
7194 proto = iph->ip_p;
7195 off = (uint16_t)(iph->ip_hl << 2);
7196 ip_id_val = iph->ip_id;
7197 ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;
7198
7199 tot_len = ntohs(iph->ip_len);
7200
7201 /*
7202 * For packets that are not first frags
7203 * we only need to adjust CSUM.
7204 * For 4 to 6, Fragmentation header gets appended
7205 * after proto translation.
7206 */
7207 if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
7208 is_frag = TRUE;
7209
7210 /* If the offset is not zero, it is not first frag */
7211 if (ip_frag_off != 0) {
7212 is_first_frag = FALSE;
7213 }
7214
7215 /* If IP_MF is set, then it is not last frag */
7216 if (ntohs(iph->ip_off) & IP_MF) {
7217 is_last_frag = FALSE;
7218 }
7219 }
7220
7221 /*
7222 * Retrive the local IPv6 CLAT46 address reserved for stateless
7223 * translation.
7224 */
7225 ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7226 if (ia6_clat_src == NULL) {
7227 ip6stat.ip6s_clat464_out_nov6addr_drop++;
7228 error = -1;
7229 goto cleanup;
7230 }
7231
7232 src = &ia6_clat_src->ia_addr.sin6_addr;
7233
7234 /*
7235 * Translate IPv4 destination to IPv6 destination by using the
7236 * prefixes learned through prior PLAT discovery.
7237 */
7238 if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
7239 ip6stat.ip6s_clat464_out_v6synthfail_drop++;
7240 goto cleanup;
7241 }
7242
7243 /* Translate the IP header part first */
7244 error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
7245 iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;
7246
7247 iph = NULL; /* Invalidate iph as pbuf has been modified */
7248
7249 if (error != 0) {
7250 ip6stat.ip6s_clat464_out_46transfail_drop++;
7251 goto cleanup;
7252 }
7253
7254 /*
7255 * Translate protocol header, update checksum, checksum flags
7256 * and related fields.
7257 */
7258 error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
7259 proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;
7260
7261 if (error != 0) {
7262 ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
7263 goto cleanup;
7264 }
7265
7266 /* Now insert the IPv6 fragment header */
7267 if (is_frag) {
7268 error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);
7269
7270 if (error != 0) {
7271 ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
7272 goto cleanup;
7273 }
7274 }
7275
7276 cleanup:
7277 if (ia6_clat_src != NULL) {
7278 IFA_REMREF(&ia6_clat_src->ia_ifa);
7279 }
7280
7281 if (pbuf_is_valid(pbuf)) {
7282 *m = pbuf->pb_mbuf;
7283 pbuf->pb_mbuf = NULL;
7284 pbuf_destroy(pbuf);
7285 } else {
7286 error = -1;
7287 ip6stat.ip6s_clat464_out_invalpbuf_drop++;
7288 }
7289
7290 if (error == 0) {
7291 *proto_family = PF_INET6;
7292 ip6stat.ip6s_clat464_out_success++;
7293 }
7294
7295 return error;
7296 }
7297
7298 /*
7299 * @brief This routine translates incoming IPv6 to IPv4 packet,
7300 * updates protocol checksum and also translates ICMPv6 outer
7301 * and inner headers
7302 *
7303 * @return 0 on success or else a negative value.
7304 */
7305 static errno_t
dlil_clat64(ifnet_t ifp,protocol_family_t * proto_family,mbuf_t * m)7306 dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7307 {
7308 VERIFY(*proto_family == PF_INET6);
7309 VERIFY(IS_INTF_CLAT46(ifp));
7310
7311 struct ip6_hdr *ip6h = NULL;
7312 struct in6_addr osrc, odst;
7313 uint8_t proto = 0;
7314 struct in6_ifaddr *ia6_clat_dst = NULL;
7315 struct in_ifaddr *ia4_clat_dst = NULL;
7316 struct in_addr *dst = NULL;
7317 struct in_addr src;
7318 int error = 0;
7319 uint32_t off = 0;
7320 u_int64_t tot_len = 0;
7321 uint8_t tos = 0;
7322 boolean_t is_first_frag = TRUE;
7323
7324 /* Incoming mbuf does not contain valid IP6 header */
7325 if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
7326 ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
7327 (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
7328 ip6stat.ip6s_clat464_in_tooshort_drop++;
7329 return -1;
7330 }
7331
7332 ip6h = mtod(*m, struct ip6_hdr *);
7333 /* Validate that mbuf contains IP payload equal to ip6_plen */
7334 if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
7335 ip6stat.ip6s_clat464_in_tooshort_drop++;
7336 return -1;
7337 }
7338
7339 osrc = ip6h->ip6_src;
7340 odst = ip6h->ip6_dst;
7341
7342 /*
7343 * Retrieve the local CLAT46 reserved IPv6 address.
7344 * Let the packet pass if we don't find one, as the flag
7345 * may get set before IPv6 configuration has taken place.
7346 */
7347 ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7348 if (ia6_clat_dst == NULL) {
7349 goto done;
7350 }
7351
7352 /*
7353 * Check if the original dest in the packet is same as the reserved
7354 * CLAT46 IPv6 address
7355 */
7356 if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
7357 pbuf_t pbuf_store, *pbuf = NULL;
7358 pbuf_init_mbuf(&pbuf_store, *m, ifp);
7359 pbuf = &pbuf_store;
7360
7361 /*
7362 * Retrive the local CLAT46 IPv4 address reserved for stateless
7363 * translation.
7364 */
7365 ia4_clat_dst = inifa_ifpclatv4(ifp);
7366 if (ia4_clat_dst == NULL) {
7367 IFA_REMREF(&ia6_clat_dst->ia_ifa);
7368 ip6stat.ip6s_clat464_in_nov4addr_drop++;
7369 error = -1;
7370 goto cleanup;
7371 }
7372 IFA_REMREF(&ia6_clat_dst->ia_ifa);
7373
7374 /* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
7375 dst = &ia4_clat_dst->ia_addr.sin_addr;
7376 if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
7377 ip6stat.ip6s_clat464_in_v4synthfail_drop++;
7378 error = -1;
7379 goto cleanup;
7380 }
7381
7382 ip6h = pbuf->pb_data;
7383 off = sizeof(struct ip6_hdr);
7384 proto = ip6h->ip6_nxt;
7385 tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
7386 tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);
7387
7388 /*
7389 * Translate the IP header and update the fragmentation
7390 * header if needed
7391 */
7392 error = (nat464_translate_64(pbuf, off, tos, &proto,
7393 ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
7394 0 : -1;
7395
7396 ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */
7397
7398 if (error != 0) {
7399 ip6stat.ip6s_clat464_in_64transfail_drop++;
7400 goto cleanup;
7401 }
7402
7403 /*
7404 * Translate protocol header, update checksum, checksum flags
7405 * and related fields.
7406 */
7407 error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
7408 (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
7409 NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;
7410
7411 if (error != 0) {
7412 ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
7413 goto cleanup;
7414 }
7415
7416 cleanup:
7417 if (ia4_clat_dst != NULL) {
7418 IFA_REMREF(&ia4_clat_dst->ia_ifa);
7419 }
7420
7421 if (pbuf_is_valid(pbuf)) {
7422 *m = pbuf->pb_mbuf;
7423 pbuf->pb_mbuf = NULL;
7424 pbuf_destroy(pbuf);
7425 } else {
7426 error = -1;
7427 ip6stat.ip6s_clat464_in_invalpbuf_drop++;
7428 }
7429
7430 if (error == 0) {
7431 *proto_family = PF_INET;
7432 ip6stat.ip6s_clat464_in_success++;
7433 }
7434 } /* CLAT traffic */
7435
7436 done:
7437 return error;
7438 }
7439
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/* Deferred ioctl request: which interface and which ioctl code to issue */
struct ifnet_ioctl_event {
	struct ifnet *ifp;
	u_long ioctl_code;
};

/*
 * Work-queue wrapper for a deferred ioctl; the callback recovers the
 * enclosing structure from the embedded entry via __container_of().
 */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7452
7453 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7454 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7455 {
7456 struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7457
7458 /*
7459 * Get an io ref count if the interface is attached.
7460 * At this point it most likely is. We are taking a reference for
7461 * deferred processing.
7462 */
7463 if (!ifnet_is_attached(ifp, 1)) {
7464 os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7465 "is not attached",
7466 __func__, __LINE__, if_name(ifp), ioctl_code);
7467 return;
7468 }
7469
7470 p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7471 Z_WAITOK | Z_ZERO | Z_NOFAIL);
7472
7473 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7474 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7475 p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7476 nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7477 }
7478
7479 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7480 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7481 {
7482 struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7483 struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7484
7485 struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7486 u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7487 int ret = 0;
7488
7489 if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7490 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7491 __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7492 } else if (dlil_verbose) {
7493 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7494 "for ioctl %lu",
7495 __func__, __LINE__, if_name(ifp), ioctl_code);
7496 }
7497 ifnet_decr_iorefcnt(ifp);
7498 kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7499 return;
7500 }
7501
/*
 * Dispatch an ioctl for an interface: first through every attached
 * interface filter, then to the protocol (if proto_fam is non-zero),
 * and finally to the interface driver itself.  A handler signals
 * "handled" with any result other than EOPNOTSUPP; EJUSTRETURN means
 * "handled, stop now" and is mapped to 0 on return.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;	/* sentinel: nobody has handled it yet */
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/*
			 * Drop the lock across the filter callout; the busy
			 * marker set above keeps the list stable meanwhile.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled"; callers see success */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	/* Drop the io reference taken at the top */
	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7619
7620 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7621 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7622 {
7623 errno_t error = 0;
7624
7625
7626 if (ifp->if_set_bpf_tap) {
7627 /* Get an io reference on the interface if it is attached */
7628 if (!ifnet_is_attached(ifp, 1)) {
7629 return ENXIO;
7630 }
7631 error = ifp->if_set_bpf_tap(ifp, mode, callback);
7632 ifnet_decr_iorefcnt(ifp);
7633 }
7634 return error;
7635 }
7636
7637 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7638 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7639 struct sockaddr *ll_addr, size_t ll_len)
7640 {
7641 errno_t result = EOPNOTSUPP;
7642 struct if_proto *proto;
7643 const struct sockaddr *verify;
7644 proto_media_resolve_multi resolvep;
7645
7646 if (!ifnet_is_attached(ifp, 1)) {
7647 return result;
7648 }
7649
7650 bzero(ll_addr, ll_len);
7651
7652 /* Call the protocol first; callee holds a proto refcnt upon success */
7653 ifnet_lock_shared(ifp);
7654 proto = find_attached_proto(ifp, proto_addr->sa_family);
7655 ifnet_lock_done(ifp);
7656 if (proto != NULL) {
7657 resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7658 proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7659 if (resolvep != NULL) {
7660 result = resolvep(ifp, proto_addr,
7661 (struct sockaddr_dl *)(void *)ll_addr, ll_len);
7662 }
7663 if_proto_free(proto);
7664 }
7665
7666 /* Let the interface verify the multicast address */
7667 if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7668 if (result == 0) {
7669 verify = ll_addr;
7670 } else {
7671 verify = proto_addr;
7672 }
7673 result = ifp->if_check_multi(ifp, verify);
7674 }
7675
7676 ifnet_decr_iorefcnt(ifp);
7677 return result;
7678 }
7679
7680 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7681 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7682 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7683 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7684 {
7685 struct if_proto *proto;
7686 errno_t result = 0;
7687
7688 if ((ifp->if_flags & IFF_NOARP) != 0) {
7689 result = ENOTSUP;
7690 goto done;
7691 }
7692
7693 /* callee holds a proto refcnt upon success */
7694 ifnet_lock_shared(ifp);
7695 proto = find_attached_proto(ifp, target_proto->sa_family);
7696 ifnet_lock_done(ifp);
7697 if (proto == NULL) {
7698 result = ENOTSUP;
7699 } else {
7700 proto_media_send_arp arpp;
7701 arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7702 proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7703 if (arpp == NULL) {
7704 result = ENOTSUP;
7705 } else {
7706 switch (arpop) {
7707 case ARPOP_REQUEST:
7708 arpstat.txrequests++;
7709 if (target_hw != NULL) {
7710 arpstat.txurequests++;
7711 }
7712 break;
7713 case ARPOP_REPLY:
7714 arpstat.txreplies++;
7715 break;
7716 }
7717 result = arpp(ifp, arpop, sender_hw, sender_proto,
7718 target_hw, target_proto);
7719 }
7720 if_proto_free(proto);
7721 }
7722 done:
7723 return result;
7724 }
7725
/*
 * Thread "marks" are bits kept in uthread->uu_network_marks.  The
 * push/pop routines below encode the set of bits they actually changed
 * as a byte offset from this zero-size base object, so an opaque
 * net_thread_marks_t cookie round-trips the state to restore.
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* The "no bits changed" cookie: offset zero from the base */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7731
7732 __private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)7733 net_thread_marks_push(u_int32_t push)
7734 {
7735 static const char *const base = (const void*)&net_thread_marks_base;
7736 u_int32_t pop = 0;
7737
7738 if (push != 0) {
7739 struct uthread *uth = current_uthread();
7740
7741 pop = push & ~uth->uu_network_marks;
7742 if (pop != 0) {
7743 uth->uu_network_marks |= pop;
7744 }
7745 }
7746
7747 return (net_thread_marks_t)&base[pop];
7748 }
7749
7750 __private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)7751 net_thread_unmarks_push(u_int32_t unpush)
7752 {
7753 static const char *const base = (const void*)&net_thread_marks_base;
7754 u_int32_t unpop = 0;
7755
7756 if (unpush != 0) {
7757 struct uthread *uth = current_uthread();
7758
7759 unpop = unpush & uth->uu_network_marks;
7760 if (unpop != 0) {
7761 uth->uu_network_marks &= ~unpop;
7762 }
7763 }
7764
7765 return (net_thread_marks_t)&base[unpop];
7766 }
7767
/*
 * Undo a net_thread_marks_push(): clear exactly the bits that the
 * matching push set, as encoded in the cookie's offset from the base.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	/* Decode the mark bits that were encoded as an offset from base */
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/*
		 * The decoded value must fit in 32 bits, and every bit
		 * being popped must currently be set on this thread.
		 */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7783
/*
 * Undo a net_thread_unmarks_push(): re-set exactly the bits that the
 * matching unmarks-push cleared, as encoded in the cookie's offset.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	/* Decode the mark bits that were encoded as an offset from base */
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/*
		 * The decoded value must fit in 32 bits, and every bit
		 * being restored must currently be clear on this thread.
		 */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
7799
7800 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7801 net_thread_is_marked(u_int32_t check)
7802 {
7803 if (check != 0) {
7804 struct uthread *uth = current_uthread();
7805 return uth->uu_network_marks & check;
7806 } else {
7807 return 0;
7808 }
7809 }
7810
7811 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7812 net_thread_is_unmarked(u_int32_t check)
7813 {
7814 if (check != 0) {
7815 struct uthread *uth = current_uthread();
7816 return ~uth->uu_network_marks & check;
7817 } else {
7818 return 0;
7819 }
7820 }
7821
7822 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)7823 _is_announcement(const struct sockaddr_in * sender_sin,
7824 const struct sockaddr_in * target_sin)
7825 {
7826 if (target_sin == NULL || sender_sin == NULL) {
7827 return FALSE;
7828 }
7829
7830 return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
7831 }
7832
/*
 * Send an ARP packet on behalf of a protocol.  Validates that both protocol
 * addresses are present and of the same family, tags router targets with
 * SIN_ROUTER for the send_arp callback, and fans an IPv4 link-local ARP
 * request out to every IFEF_ARPLL interface — except for announcements,
 * which must stay on the given interface.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	/* Local, mutable alias so the router case can substitute a copy. */
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces. The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t *ifp_list;
		u_int32_t count;
		u_int32_t ifp_on;

		/* Becomes the first per-interface result below, if any. */
		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing. This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* Hold the lladdr across the unlocked send. */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* Keep only the first interface's status. */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7947
7948 /*
7949 * Caller must hold ifnet head lock.
7950 */
7951 static int
ifnet_lookup(struct ifnet * ifp)7952 ifnet_lookup(struct ifnet *ifp)
7953 {
7954 struct ifnet *_ifp;
7955
7956 LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7957 TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7958 if (_ifp == ifp) {
7959 break;
7960 }
7961 }
7962 return _ifp != NULL;
7963 }
7964
7965 /*
7966 * Caller has to pass a non-zero refio argument to get a
7967 * IO reference count. This will prevent ifnet_detach from
7968 * being called when there are outstanding io reference counts.
7969 */
7970 int
ifnet_is_attached(struct ifnet * ifp,int refio)7971 ifnet_is_attached(struct ifnet *ifp, int refio)
7972 {
7973 int ret;
7974
7975 lck_mtx_lock_spin(&ifp->if_ref_lock);
7976 if ((ret = IF_FULLY_ATTACHED(ifp))) {
7977 if (refio > 0) {
7978 ifp->if_refio++;
7979 }
7980 }
7981 lck_mtx_unlock(&ifp->if_ref_lock);
7982
7983 return ret;
7984 }
7985
/* Account for a kernel thread that is starting up on behalf of this ifnet. */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7993
/*
 * Drop the pending-thread count; wake any waiter (sleeping on
 * &ifp->if_threads_pending) once the last pending thread is accounted for.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8005
8006 /*
8007 * Caller must ensure the interface is attached; the assumption is that
8008 * there is at least an outstanding IO reference count held already.
8009 * Most callers would call ifnet_is_{attached,data_ready}() instead.
8010 */
8011 void
ifnet_incr_iorefcnt(struct ifnet * ifp)8012 ifnet_incr_iorefcnt(struct ifnet *ifp)
8013 {
8014 lck_mtx_lock_spin(&ifp->if_ref_lock);
8015 VERIFY(IF_FULLY_ATTACHED(ifp));
8016 VERIFY(ifp->if_refio > 0);
8017 ifp->if_refio++;
8018 lck_mtx_unlock(&ifp->if_ref_lock);
8019 }
8020
/*
 * Drop one IO reference with if_ref_lock already held; shared by
 * ifnet_decr_iorefcnt() and the datamov end/resume paths.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* Data-mover holders each carry an IO ref, so refio==0 implies datamov==0. */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
8041
/* Locking wrapper: drop one IO reference (see ifnet_decr_iorefcnt_locked). */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8049
8050 boolean_t
ifnet_datamov_begin(struct ifnet * ifp)8051 ifnet_datamov_begin(struct ifnet *ifp)
8052 {
8053 boolean_t ret;
8054
8055 lck_mtx_lock_spin(&ifp->if_ref_lock);
8056 if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8057 ifp->if_refio++;
8058 ifp->if_datamov++;
8059 }
8060 lck_mtx_unlock(&ifp->if_ref_lock);
8061
8062 return ret;
8063 }
8064
/*
 * Leave the data-movement path: drop the data-mover count (waking any
 * drainers when it reaches zero) and release the IO reference taken by
 * ifnet_datamov_begin().
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8082
/*
 * Suspend data movement with if_ref_lock held: take an IO reference and,
 * on the first suspension, clear IFRF_READY so new data movers are refused.
 */
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8093
/* Locking wrapper for ifnet_datamov_suspend_locked(); pair with resume. */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8102
8103 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8104 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8105 {
8106 lck_mtx_lock_spin(&ifp->if_ref_lock);
8107 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8108 if (ifp->if_suspend > 0) {
8109 lck_mtx_unlock(&ifp->if_ref_lock);
8110 return FALSE;
8111 }
8112 ifnet_datamov_suspend_locked(ifp);
8113 lck_mtx_unlock(&ifp->if_ref_lock);
8114 return TRUE;
8115 }
8116
/*
 * Block until every in-flight data mover has left the data path, then
 * purge the transmit queues.  Data movement must already be suspended
 * (if_suspend > 0 and IFRF_READY cleared) before calling.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* Sleep until ifnet_datamov_end() wakes us with if_datamov == 0. */
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8144
/* Convenience: suspend data movement, then wait for the path to quiesce. */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8151
/*
 * Undo one suspension: when the last suspender resumes, restore
 * IFRF_READY, then release the IO reference taken at suspend time.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8165
/*
 * Record a refcount hold (refhold != 0) or release into the per-ifnet
 * debug ring buffers; only valid for interfaces built with DLIF_DEBUG.
 */
static void
dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
{
	struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
	ctrace_t *tr;
	u_int32_t idx;
	u_int16_t *cnt;

	if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
		panic("%s: dl_if %p has no debug structure", __func__, dl_if);
		/* NOTREACHED */
	}

	/* Select the hold or release history depending on the event. */
	if (refhold) {
		cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
		tr = dl_if_dbg->dldbg_if_refhold;
	} else {
		cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
		tr = dl_if_dbg->dldbg_if_refrele;
	}

	/* Ring buffer: counter wraps modulo the history size. */
	idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
	ctrace_record(&tr[idx]);
}
8190
/*
 * Take a reference on the underlying dlil_ifnet.  Returns EINVAL for a
 * NULL ifp; panics on refcount wraparound.
 */
errno_t
dlil_if_ref(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	++dl_if->dl_if_refcnt;
	if (dl_if->dl_if_refcnt == 0) {
		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
	}
	/* Record the hold when debug tracing is enabled for this ifnet. */
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, TRUE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);

	return 0;
}
8213
/*
 * Release a reference on the underlying dlil_ifnet.  When the last
 * reference is dropped while the ifnet is still embryonic, the structure
 * is handed back via _dlil_if_release().  Panics on refcount underflow.
 */
errno_t
dlil_if_free(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	bool need_release = FALSE;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	switch (dl_if->dl_if_refcnt) {
	case 0:
		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
		break;
	case 1:
		/* Dropping the last ref on an embryonic (never-attached) ifnet. */
		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
			need_release = TRUE;
		}
		break;
	default:
		break;
	}
	--dl_if->dl_if_refcnt;
	/* Record the release when debug tracing is enabled for this ifnet. */
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, FALSE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);
	/* Release outside the lock. */
	if (need_release) {
		_dlil_if_release(ifp, true);
	}
	return 0;
}
8248
/*
 * Attach a prepared if_proto to its interface: reject duplicates, let the
 * family module refine the demux descriptors, insert the proto at the tail
 * of its hash chain, and post KEV_DL_PROTO_ATTACHED.  On success the proto
 * holds one refcnt for the attach; *proto_count (if non-NULL) receives the
 * number of protocols attached afterwards.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* Hold an IO ref so the ifnet can't detach underneath us. */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		ifnet_lock_done(ifp);
		/* Drop the lookup's refcnt before failing with EEXIST. */
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* Walk to the tail of the chain so insertion preserves order. */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	/* Balance the IO ref taken by ifnet_is_attached(ifp, 1) above. */
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8328
/*
 * Post-attach housekeeping: mark the interface up, push the flag change
 * to the driver via SIOCSIFFLAGS, broadcast the flags event, and (with
 * Skywalk) attach the flowswitch nexus when IP was just attached.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8352
/*
 * Public KPI: attach a v1 protocol to an interface.  Validates arguments,
 * confirms the ifnet is on the global list, builds an if_proto from the
 * caller's v1 callbacks, and hands it to dlil_attach_protocol().  On
 * failure the if_proto is freed here; on success the interface is marked
 * up via dlil_handle_proto_attach().
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* Bring the interface up now that a protocol is attached. */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8414
/*
 * Public KPI: attach a v2 protocol to an interface.  Identical flow to
 * ifnet_attach_protocol(), but populates the v2 callback union (v2 input
 * takes no frame header pointer).
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* Bring the interface up now that a protocol is attached. */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8476
/*
 * Public KPI: detach a protocol from an interface.  Notifies the family
 * module, unlinks the proto from its hash chain, swaps the callbacks for
 * inert media stubs (so racing callers hit ENXIO instead of freed code),
 * and drops both the attach refcnt and the lookup refcnt; the remaining
 * detach work runs when the last proto reference goes away.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* Replace live callbacks with stubs before publishing the detach. */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8542
8543
/*
 * Inert v1 input stub installed on a detached protocol (see
 * ifnet_detach_protocol); rejects all packets with ENXIO.
 */
static errno_t
ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet, char *header)
{
#pragma unused(ifp, protocol, packet, header)
	return ENXIO;
}
8551
/*
 * Inert v2 input stub installed on a detached protocol (see
 * ifnet_detach_protocol); rejects all packets with ENXIO.
 */
static errno_t
ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet)
{
#pragma unused(ifp, protocol, packet)
	return ENXIO;
}
8559
/*
 * Inert pre-output stub installed on a detached protocol (see
 * ifnet_detach_protocol); fails all output attempts with ENXIO.
 */
static errno_t
ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
    char *link_layer_dest)
{
#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
	return ENXIO;
}
8568
/*
 * Inert event stub installed on a detached protocol (see
 * ifnet_detach_protocol); silently drops all events.
 */
static void
ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
    const struct kev_msg *event)
{
#pragma unused(ifp, protocol, event)
}
8575
/*
 * Inert ioctl stub installed on a detached protocol (see
 * ifnet_detach_protocol); fails all requests with ENXIO.
 */
static errno_t
ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
    unsigned long command, void *argument)
{
#pragma unused(ifp, protocol, command, argument)
	return ENXIO;
}
8583
/*
 * Inert multicast-resolve stub installed on a detached protocol (see
 * ifnet_detach_protocol); fails all resolutions with ENXIO.
 */
static errno_t
ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
    struct sockaddr_dl *out_ll, size_t ll_len)
{
#pragma unused(ifp, proto_addr, out_ll, ll_len)
	return ENXIO;
}
8591
/*
 * Inert send_arp stub installed on a detached protocol (see
 * ifnet_detach_protocol); fails all ARP sends with ENXIO.
 */
static errno_t
ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
	return ENXIO;
}
8600
8601 extern int if_next_index(void);
8602 extern int tcp_ecn_outbound;
8603
/*
 * Configure and initialize a transmit classq for an interface: derive the
 * scheduler flags from global policy (flow control, delay-based queueing)
 * and the interface's output scheduling model, inherit the drop limit from
 * the default queue for secondary classqs, then set the queue up.  Panics
 * if the classq cannot be initialized.
 */
void
dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
{
	uint32_t sflags = 0;
	int err;

	if (if_flowadv) {
		sflags |= PKTSCHEDF_QALG_FLOWCTL;
	}

	if (if_delaybased_queue) {
		sflags |= PKTSCHEDF_QALG_DELAYBASED;
	}

	if (ifp->if_output_sched_model ==
	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
	}
	/* Inherit drop limit from the default queue */
	if (ifp->if_snd != ifcq) {
		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
	}
	/* Initialize transmit queue(s) */
	err = ifclassq_setup(ifcq, ifp, sflags);
	if (err != 0) {
		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
		    "err=%d", __func__, ifp, err);
		/* NOTREACHED */
	}
}
8634
8635 errno_t
ifnet_attach(ifnet_t ifp,const struct sockaddr_dl * ll_addr)8636 ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
8637 {
8638 #if SKYWALK
8639 boolean_t netif_compat;
8640 if_nexus_netif nexus_netif;
8641 #endif /* SKYWALK */
8642 struct ifnet *tmp_if;
8643 struct ifaddr *ifa;
8644 struct if_data_internal if_data_saved;
8645 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8646 struct dlil_threading_info *dl_inp;
8647 thread_continue_t thfunc = NULL;
8648 int err;
8649
8650 if (ifp == NULL) {
8651 return EINVAL;
8652 }
8653
8654 /*
8655 * Serialize ifnet attach using dlil_ifnet_lock, in order to
8656 * prevent the interface from being configured while it is
8657 * embryonic, as ifnet_head_lock is dropped and reacquired
8658 * below prior to marking the ifnet with IFRF_ATTACHED.
8659 */
8660 dlil_if_lock();
8661 ifnet_head_lock_exclusive();
8662 /* Verify we aren't already on the list */
8663 TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
8664 if (tmp_if == ifp) {
8665 ifnet_head_done();
8666 dlil_if_unlock();
8667 return EEXIST;
8668 }
8669 }
8670
8671 lck_mtx_lock_spin(&ifp->if_ref_lock);
8672 if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
8673 panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
8674 __func__, ifp);
8675 /* NOTREACHED */
8676 }
8677 lck_mtx_unlock(&ifp->if_ref_lock);
8678
8679 ifnet_lock_exclusive(ifp);
8680
8681 /* Sanity check */
8682 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
8683 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
8684 VERIFY(ifp->if_threads_pending == 0);
8685
8686 if (ll_addr != NULL) {
8687 if (ifp->if_addrlen == 0) {
8688 ifp->if_addrlen = ll_addr->sdl_alen;
8689 } else if (ll_addr->sdl_alen != ifp->if_addrlen) {
8690 ifnet_lock_done(ifp);
8691 ifnet_head_done();
8692 dlil_if_unlock();
8693 return EINVAL;
8694 }
8695 }
8696
8697 /*
8698 * Allow interfaces without protocol families to attach
8699 * only if they have the necessary fields filled out.
8700 */
8701 if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
8702 DLIL_PRINTF("%s: Attempt to attach interface without "
8703 "family module - %d\n", __func__, ifp->if_family);
8704 ifnet_lock_done(ifp);
8705 ifnet_head_done();
8706 dlil_if_unlock();
8707 return ENODEV;
8708 }
8709
8710 /* Allocate protocol hash table */
8711 VERIFY(ifp->if_proto_hash == NULL);
8712 ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
8713 PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8714
8715 lck_mtx_lock_spin(&ifp->if_flt_lock);
8716 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
8717 TAILQ_INIT(&ifp->if_flt_head);
8718 VERIFY(ifp->if_flt_busy == 0);
8719 VERIFY(ifp->if_flt_waiters == 0);
8720 VERIFY(ifp->if_flt_non_os_count == 0);
8721 VERIFY(ifp->if_flt_no_tso_count == 0);
8722 lck_mtx_unlock(&ifp->if_flt_lock);
8723
8724 if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
8725 VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
8726 LIST_INIT(&ifp->if_multiaddrs);
8727 }
8728
8729 VERIFY(ifp->if_allhostsinm == NULL);
8730 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
8731 TAILQ_INIT(&ifp->if_addrhead);
8732
8733 if (ifp->if_index == 0) {
8734 int idx = if_next_index();
8735
8736 /*
8737 * Since we exhausted the list of
8738 * if_index's, try to find an empty slot
8739 * in ifindex2ifnet.
8740 */
8741 if (idx == -1 && if_index >= UINT16_MAX) {
8742 for (int i = 1; i < if_index; i++) {
8743 if (ifindex2ifnet[i] == NULL &&
8744 ifnet_addrs[i - 1] == NULL) {
8745 idx = i;
8746 break;
8747 }
8748 }
8749 }
8750 if (idx == -1) {
8751 ifp->if_index = 0;
8752 ifnet_lock_done(ifp);
8753 ifnet_head_done();
8754 dlil_if_unlock();
8755 return ENOBUFS;
8756 }
8757 ifp->if_index = (uint16_t)idx;
8758
8759 /* the lladdr passed at attach time is the permanent address */
8760 if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
8761 ll_addr->sdl_alen == ETHER_ADDR_LEN) {
8762 bcopy(CONST_LLADDR(ll_addr),
8763 dl_if->dl_if_permanent_ether,
8764 ETHER_ADDR_LEN);
8765 dl_if->dl_if_permanent_ether_is_set = 1;
8766 }
8767 }
8768 /* There should not be anything occupying this slot */
8769 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
8770
8771 /* allocate (if needed) and initialize a link address */
8772 ifa = dlil_alloc_lladdr(ifp, ll_addr);
8773 if (ifa == NULL) {
8774 ifnet_lock_done(ifp);
8775 ifnet_head_done();
8776 dlil_if_unlock();
8777 return ENOBUFS;
8778 }
8779
8780 VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
8781 ifnet_addrs[ifp->if_index - 1] = ifa;
8782
8783 /* make this address the first on the list */
8784 IFA_LOCK(ifa);
8785 /* hold a reference for ifnet_addrs[] */
8786 IFA_ADDREF_LOCKED(ifa);
8787 /* if_attach_link_ifa() holds a reference for ifa_link */
8788 if_attach_link_ifa(ifp, ifa);
8789 IFA_UNLOCK(ifa);
8790
8791 TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
8792 ifindex2ifnet[ifp->if_index] = ifp;
8793
8794 /* Hold a reference to the underlying dlil_ifnet */
8795 ifnet_reference(ifp);
8796
8797 /* Clear stats (save and restore other fields that we care) */
8798 if_data_saved = ifp->if_data;
8799 bzero(&ifp->if_data, sizeof(ifp->if_data));
8800 ifp->if_data.ifi_type = if_data_saved.ifi_type;
8801 ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
8802 ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
8803 ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
8804 ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
8805 ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
8806 ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
8807 ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
8808 ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
8809 ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
8810 ifnet_touch_lastchange(ifp);
8811
8812 VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
8813 ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
8814 ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);
8815
8816 dlil_ifclassq_setup(ifp, ifp->if_snd);
8817
8818 /* Sanity checks on the input thread storage */
8819 dl_inp = &dl_if->dl_if_inpstorage;
8820 bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
8821 VERIFY(dl_inp->dlth_flags == 0);
8822 VERIFY(dl_inp->dlth_wtot == 0);
8823 VERIFY(dl_inp->dlth_ifp == NULL);
8824 VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
8825 VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
8826 VERIFY(!dl_inp->dlth_affinity);
8827 VERIFY(ifp->if_inp == NULL);
8828 VERIFY(dl_inp->dlth_thread == THREAD_NULL);
8829 VERIFY(dl_inp->dlth_strategy == NULL);
8830 VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
8831 VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
8832 VERIFY(dl_inp->dlth_affinity_tag == 0);
8833
8834 #if IFNET_INPUT_SANITY_CHK
8835 VERIFY(dl_inp->dlth_pkts_cnt == 0);
8836 #endif /* IFNET_INPUT_SANITY_CHK */
8837
8838 VERIFY(ifp->if_poll_thread == THREAD_NULL);
8839 dlil_reset_rxpoll_params(ifp);
8840 /*
8841 * A specific DLIL input thread is created per non-loopback interface.
8842 */
8843 if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
8844 ifp->if_inp = dl_inp;
8845 ifnet_incr_pending_thread_count(ifp);
8846 err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
8847 if (err == ENODEV) {
8848 VERIFY(thfunc == NULL);
8849 ifnet_decr_pending_thread_count(ifp);
8850 } else if (err != 0) {
8851 panic_plain("%s: ifp=%p couldn't get an input thread; "
8852 "err=%d", __func__, ifp, err);
8853 /* NOTREACHED */
8854 }
8855 }
8856 /*
8857 * If the driver supports the new transmit model, calculate flow hash
8858 * and create a workloop starter thread to invoke the if_start callback
8859 * where the packets may be dequeued and transmitted.
8860 */
8861 if (ifp->if_eflags & IFEF_TXSTART) {
8862 thread_precedence_policy_data_t info;
8863 __unused kern_return_t kret;
8864
8865 ifp->if_flowhash = ifnet_calc_flowhash(ifp);
8866 VERIFY(ifp->if_flowhash != 0);
8867 VERIFY(ifp->if_start_thread == THREAD_NULL);
8868
8869 ifnet_set_start_cycle(ifp, NULL);
8870 ifp->if_start_active = 0;
8871 ifp->if_start_req = 0;
8872 ifp->if_start_flags = 0;
8873 VERIFY(ifp->if_start != NULL);
8874 ifnet_incr_pending_thread_count(ifp);
8875 if ((err = kernel_thread_start(ifnet_start_thread_func,
8876 ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
8877 panic_plain("%s: "
8878 "ifp=%p couldn't get a start thread; "
8879 "err=%d", __func__, ifp, err);
8880 /* NOTREACHED */
8881 }
8882 bzero(&info, sizeof(info));
8883 info.importance = 1;
8884 kret = thread_policy_set(ifp->if_start_thread,
8885 THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8886 THREAD_PRECEDENCE_POLICY_COUNT);
8887 ASSERT(kret == KERN_SUCCESS);
8888 } else {
8889 ifp->if_flowhash = 0;
8890 }
8891
8892 /* Reset polling parameters */
8893 ifnet_set_poll_cycle(ifp, NULL);
8894 ifp->if_poll_update = 0;
8895 ifp->if_poll_flags = 0;
8896 ifp->if_poll_req = 0;
8897 VERIFY(ifp->if_poll_thread == THREAD_NULL);
8898
8899 /*
8900 * If the driver supports the new receive model, create a poller
8901 * thread to invoke if_input_poll callback where the packets may
8902 * be dequeued from the driver and processed for reception.
8903 * if the interface is netif compat then the poller thread is
8904 * managed by netif.
8905 */
8906 if (thfunc == dlil_rxpoll_input_thread_func) {
8907 thread_precedence_policy_data_t info;
8908 __unused kern_return_t kret;
8909 #if SKYWALK
8910 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
8911 #endif /* SKYWALK */
8912 VERIFY(ifp->if_input_poll != NULL);
8913 VERIFY(ifp->if_input_ctl != NULL);
8914 ifnet_incr_pending_thread_count(ifp);
8915 if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
8916 &ifp->if_poll_thread)) != KERN_SUCCESS) {
8917 panic_plain("%s: ifp=%p couldn't get a poll thread; "
8918 "err=%d", __func__, ifp, err);
8919 /* NOTREACHED */
8920 }
8921 bzero(&info, sizeof(info));
8922 info.importance = 1;
8923 kret = thread_policy_set(ifp->if_poll_thread,
8924 THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8925 THREAD_PRECEDENCE_POLICY_COUNT);
8926 ASSERT(kret == KERN_SUCCESS);
8927 }
8928
8929 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
8930 VERIFY(ifp->if_desc.ifd_len == 0);
8931 VERIFY(ifp->if_desc.ifd_desc != NULL);
8932
8933 /* Record attach PC stacktrace */
8934 ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
8935
8936 ifp->if_updatemcasts = 0;
8937 if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
8938 struct ifmultiaddr *ifma;
8939 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
8940 IFMA_LOCK(ifma);
8941 if (ifma->ifma_addr->sa_family == AF_LINK ||
8942 ifma->ifma_addr->sa_family == AF_UNSPEC) {
8943 ifp->if_updatemcasts++;
8944 }
8945 IFMA_UNLOCK(ifma);
8946 }
8947
8948 DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
8949 "membership(s)\n", if_name(ifp),
8950 ifp->if_updatemcasts);
8951 }
8952
8953 /* Clear logging parameters */
8954 bzero(&ifp->if_log, sizeof(ifp->if_log));
8955
8956 /* Clear foreground/realtime activity timestamps */
8957 ifp->if_fg_sendts = 0;
8958 ifp->if_rt_sendts = 0;
8959
8960 /* Clear throughput estimates and radio type */
8961 ifp->if_estimated_up_bucket = 0;
8962 ifp->if_estimated_down_bucket = 0;
8963 ifp->if_radio_type = 0;
8964 ifp->if_radio_channel = 0;
8965
8966 VERIFY(ifp->if_delegated.ifp == NULL);
8967 VERIFY(ifp->if_delegated.type == 0);
8968 VERIFY(ifp->if_delegated.family == 0);
8969 VERIFY(ifp->if_delegated.subfamily == 0);
8970 VERIFY(ifp->if_delegated.expensive == 0);
8971 VERIFY(ifp->if_delegated.constrained == 0);
8972
8973 VERIFY(ifp->if_agentids == NULL);
8974 VERIFY(ifp->if_agentcount == 0);
8975
8976 /* Reset interface state */
8977 bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
8978 ifp->if_interface_state.valid_bitmask |=
8979 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
8980 ifp->if_interface_state.interface_availability =
8981 IF_INTERFACE_STATE_INTERFACE_AVAILABLE;
8982
8983 /* Initialize Link Quality Metric (loopback [lo0] is always good) */
8984 if (ifp == lo_ifp) {
8985 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
8986 ifp->if_interface_state.valid_bitmask |=
8987 IF_INTERFACE_STATE_LQM_STATE_VALID;
8988 } else {
8989 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
8990 }
8991
8992 /*
8993 * Enable ECN capability on this interface depending on the
8994 * value of ECN global setting
8995 */
8996 if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
8997 if_set_eflags(ifp, IFEF_ECN_ENABLE);
8998 if_clear_eflags(ifp, IFEF_ECN_DISABLE);
8999 }
9000
9001 /*
9002 * Built-in Cyclops always on policy for WiFi infra
9003 */
9004 if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
9005 errno_t error;
9006
9007 error = if_set_qosmarking_mode(ifp,
9008 IFRTYPE_QOSMARKING_FASTLANE);
9009 if (error != 0) {
9010 DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
9011 __func__, ifp->if_xname, error);
9012 } else {
9013 if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9014 #if (DEVELOPMENT || DEBUG)
9015 DLIL_PRINTF("%s fastlane enabled on %s\n",
9016 __func__, ifp->if_xname);
9017 #endif /* (DEVELOPMENT || DEBUG) */
9018 }
9019 }
9020
9021 ifnet_lock_done(ifp);
9022 ifnet_head_done();
9023
9024 #if SKYWALK
9025 netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
9026 #endif /* SKYWALK */
9027
9028 lck_mtx_lock(&ifp->if_cached_route_lock);
9029 /* Enable forwarding cached route */
9030 ifp->if_fwd_cacheok = 1;
9031 /* Clean up any existing cached routes */
9032 ROUTE_RELEASE(&ifp->if_fwd_route);
9033 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9034 ROUTE_RELEASE(&ifp->if_src_route);
9035 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9036 ROUTE_RELEASE(&ifp->if_src_route6);
9037 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9038 lck_mtx_unlock(&ifp->if_cached_route_lock);
9039
9040 ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
9041
9042 /*
9043 * Allocate and attach IGMPv3/MLDv2 interface specific variables
9044 * and trees; do this before the ifnet is marked as attached.
9045 * The ifnet keeps the reference to the info structures even after
9046 * the ifnet is detached, since the network-layer records still
9047 * refer to the info structures even after that. This also
9048 * makes it possible for them to still function after the ifnet
9049 * is recycled or reattached.
9050 */
9051 #if INET
9052 if (IGMP_IFINFO(ifp) == NULL) {
9053 IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
9054 VERIFY(IGMP_IFINFO(ifp) != NULL);
9055 } else {
9056 VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
9057 igmp_domifreattach(IGMP_IFINFO(ifp));
9058 }
9059 #endif /* INET */
9060 if (MLD_IFINFO(ifp) == NULL) {
9061 MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
9062 VERIFY(MLD_IFINFO(ifp) != NULL);
9063 } else {
9064 VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
9065 mld_domifreattach(MLD_IFINFO(ifp));
9066 }
9067
9068 VERIFY(ifp->if_data_threshold == 0);
9069 VERIFY(ifp->if_dt_tcall != NULL);
9070
9071 /*
9072 * Wait for the created kernel threads for I/O to get
9073 * scheduled and run at least once before we proceed
9074 * to mark interface as attached.
9075 */
9076 lck_mtx_lock(&ifp->if_ref_lock);
9077 while (ifp->if_threads_pending != 0) {
9078 DLIL_PRINTF("%s: Waiting for all kernel threads created for "
9079 "interface %s to get scheduled at least once.\n",
9080 __func__, ifp->if_xname);
9081 (void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
9082 __func__, NULL);
9083 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
9084 }
9085 lck_mtx_unlock(&ifp->if_ref_lock);
9086 DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
9087 "at least once. Proceeding.\n", __func__, ifp->if_xname);
9088
9089 /* Final mark this ifnet as attached. */
9090 ifnet_lock_exclusive(ifp);
9091 lck_mtx_lock_spin(&ifp->if_ref_lock);
9092 ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
9093 lck_mtx_unlock(&ifp->if_ref_lock);
9094 if (net_rtref) {
9095 /* boot-args override; enable idle notification */
9096 (void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
9097 IFRF_IDLE_NOTIFY);
9098 } else {
9099 /* apply previous request(s) to set the idle flags, if any */
9100 (void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
9101 ifp->if_idle_new_flags_mask);
9102 }
9103 #if SKYWALK
9104 /* the interface is fully attached; let the nexus adapter know */
9105 if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
9106 if (netif_compat) {
9107 if (sk_netif_compat_txmodel ==
9108 NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
9109 ifnet_enqueue_multi_setup(ifp,
9110 sk_tx_delay_qlen, sk_tx_delay_timeout);
9111 }
9112 ifp->if_nx_netif = nexus_netif;
9113 }
9114 ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
9115 }
9116 #endif /* SKYWALK */
9117 ifnet_lock_done(ifp);
9118 dlil_if_unlock();
9119
9120 #if PF
9121 /*
9122 * Attach packet filter to this interface, if enabled.
9123 */
9124 pf_ifnet_hook(ifp, 1);
9125 #endif /* PF */
9126
9127 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);
9128
9129 if (dlil_verbose) {
9130 DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
9131 (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
9132 }
9133
9134 return 0;
9135 }
9136
9137 /*
9138 * Prepare the storage for the first/permanent link address, which must
9139 * must have the same lifetime as the ifnet itself. Although the link
9140 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9141 * its location in memory must never change as it may still be referred
9142 * to by some parts of the system afterwards (unfortunate implementation
9143 * artifacts inherited from BSD.)
9144 *
9145 * Caller must hold ifnet lock as writer.
9146 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;	/* address and netmask sockaddrs */
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Size the AF_LINK sockaddr: the fixed header up to sdl_data[],
	 * plus the interface name, plus room for the link-layer address.
	 * Round up to a 32-bit multiple, never below sizeof(sockaddr_dl).
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;

			/* permanent allocation: never freed, see header comment */
			ifa = zalloc_permanent(ifasize, ZALIGN(struct ifaddr));
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure. This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;	/* previous lladdr; reference dropped below */
	ifp->if_lladdr = ifa;

	/* Populate the address sockaddr_dl: name, index, type, lladdr */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		/* copy capped at sizeof(sdl_data); nlen still records full length */
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	/* Netmask: all-ones over the interface-name portion */
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* drop the reference held on the link address we replaced, if any */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
9255
/*
 * Ask the network layers to remove all of their addresses from this
 * interface: IPv4 (when INET is configured) followed by IPv6.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9264
/*
 * First phase of interface detach: mark the ifnet as detaching, remove
 * it from the global lookup structures (ifnet_head, ifindex2ifnet[],
 * the ordered list), reset per-interface state, and hand the ifnet to
 * the detacher worker thread which performs the final teardown in
 * ifnet_detach_final().
 *
 * Returns EINVAL if ifp is NULL or was never attached, ENXIO if a
 * detach is already in progress, 0 on success.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Clear the ND6 CGA-initialized flag for this interface */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	/* Tear down any network emulation (netem) attached to output */
	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	/* Flip IFRF_ATTACHED -> IFRF_DETACHING atomically under if_ref_lock */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9457
9458 static void
ifnet_detaching_enqueue(struct ifnet * ifp)9459 ifnet_detaching_enqueue(struct ifnet *ifp)
9460 {
9461 dlil_if_lock_assert();
9462
9463 ++ifnet_detaching_cnt;
9464 VERIFY(ifnet_detaching_cnt != 0);
9465 TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
9466 wakeup((caddr_t)&ifnet_delayed_run);
9467 }
9468
9469 static struct ifnet *
ifnet_detaching_dequeue(void)9470 ifnet_detaching_dequeue(void)
9471 {
9472 struct ifnet *ifp;
9473
9474 dlil_if_lock_assert();
9475
9476 ifp = TAILQ_FIRST(&ifnet_detaching_head);
9477 VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9478 if (ifp != NULL) {
9479 VERIFY(ifnet_detaching_cnt != 0);
9480 --ifnet_detaching_cnt;
9481 TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9482 ifp->if_detaching_link.tqe_next = NULL;
9483 ifp->if_detaching_link.tqe_prev = NULL;
9484 }
9485 return ifp;
9486 }
9487
/*
 * Body of the detacher worker thread, structured as a continuation:
 * drain the detaching queue, then block on ifnet_delayed_run with this
 * same function as the continuation, so each wakeup restarts it from
 * the top.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	/*
	 * First run after thread creation: leave the embryonic state and
	 * let the spawner know this thread has been scheduled once.
	 */
	if (__improbable(ifnet_detaching_embryonic)) {
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	/* Drain the queue, dropping dlil_if_lock around each final detach */
	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/*
	 * Queue is empty: wait for ifnet_detaching_enqueue() to post more
	 * work.  thread_block() does not return; it re-enters this
	 * function from the top as the continuation.
	 */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0); /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9530
/*
 * Entry point of the detacher thread.  Bootstraps the continuation:
 * registers on the ifnet_delayed_run wait channel while still marked
 * embryonic, posts one self-wakeup so the continuation runs once (to
 * clear the embryonic state and the pending-thread count), then blocks
 * into ifnet_detacher_thread_cont.  All real work happens there.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9547
9548 static void
ifnet_detach_final(struct ifnet * ifp)9549 ifnet_detach_final(struct ifnet *ifp)
9550 {
9551 struct ifnet_filter *filter, *filter_next;
9552 struct dlil_ifnet *dlifp;
9553 struct ifnet_filter_head fhead;
9554 struct dlil_threading_info *inp;
9555 struct ifaddr *ifa;
9556 ifnet_detached_func if_free;
9557 int i;
9558
9559 #if SKYWALK
9560 dlil_netif_detach_notify(ifp);
9561 /*
9562 * Wait for the datapath to quiesce before tearing down
9563 * netif/flowswitch nexuses.
9564 */
9565 dlil_quiesce_and_detach_nexuses(ifp);
9566 #endif /* SKYWALK */
9567
9568 lck_mtx_lock(&ifp->if_ref_lock);
9569 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9570 panic("%s: flags mismatch (detaching not set) ifp=%p",
9571 __func__, ifp);
9572 /* NOTREACHED */
9573 }
9574
9575 /*
9576 * Wait until the existing IO references get released
9577 * before we proceed with ifnet_detach. This is not a
9578 * common case, so block without using a continuation.
9579 */
9580 while (ifp->if_refio > 0) {
9581 DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9582 "to be released\n", __func__, if_name(ifp));
9583 (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9584 (PZERO - 1), "ifnet_ioref_wait", NULL);
9585 }
9586
9587 VERIFY(ifp->if_datamov == 0);
9588 VERIFY(ifp->if_drainers == 0);
9589 VERIFY(ifp->if_suspend == 0);
9590 ifp->if_refflags &= ~IFRF_READY;
9591 lck_mtx_unlock(&ifp->if_ref_lock);
9592
9593 /* Clear agent IDs */
9594 if (ifp->if_agentids != NULL) {
9595 kfree_data(ifp->if_agentids,
9596 sizeof(uuid_t) * ifp->if_agentcount);
9597 ifp->if_agentids = NULL;
9598 }
9599 ifp->if_agentcount = 0;
9600
9601 #if SKYWALK
9602 VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9603 #endif /* SKYWALK */
9604 /* Drain and destroy send queue */
9605 ifclassq_teardown(ifp->if_snd);
9606
9607 /* Detach interface filters */
9608 lck_mtx_lock(&ifp->if_flt_lock);
9609 if_flt_monitor_enter(ifp);
9610
9611 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9612 fhead = ifp->if_flt_head;
9613 TAILQ_INIT(&ifp->if_flt_head);
9614
9615 for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9616 filter_next = TAILQ_NEXT(filter, filt_next);
9617 lck_mtx_unlock(&ifp->if_flt_lock);
9618
9619 dlil_detach_filter_internal(filter, 1);
9620 lck_mtx_lock(&ifp->if_flt_lock);
9621 }
9622 if_flt_monitor_leave(ifp);
9623 lck_mtx_unlock(&ifp->if_flt_lock);
9624
9625 /* Tell upper layers to drop their network addresses */
9626 if_purgeaddrs(ifp);
9627
9628 ifnet_lock_exclusive(ifp);
9629
9630 /* Unplumb all protocols */
9631 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9632 struct if_proto *proto;
9633
9634 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9635 while (proto != NULL) {
9636 protocol_family_t family = proto->protocol_family;
9637 ifnet_lock_done(ifp);
9638 proto_unplumb(family, ifp);
9639 ifnet_lock_exclusive(ifp);
9640 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9641 }
9642 /* There should not be any protocols left */
9643 VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9644 }
9645 kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9646 ifp->if_proto_hash = NULL;
9647
9648 /* Detach (permanent) link address from if_addrhead */
9649 ifa = TAILQ_FIRST(&ifp->if_addrhead);
9650 VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9651 IFA_LOCK(ifa);
9652 if_detach_link_ifa(ifp, ifa);
9653 IFA_UNLOCK(ifa);
9654
9655 /* Remove (permanent) link address from ifnet_addrs[] */
9656 IFA_REMREF(ifa);
9657 ifnet_addrs[ifp->if_index - 1] = NULL;
9658
9659 /* This interface should not be on {ifnet_head,detaching} */
9660 VERIFY(ifp->if_link.tqe_next == NULL);
9661 VERIFY(ifp->if_link.tqe_prev == NULL);
9662 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9663 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9664 VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9665 VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9666
9667 /* The slot should have been emptied */
9668 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9669
9670 /* There should not be any addresses left */
9671 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9672
9673 /*
9674 * Signal the starter thread to terminate itself, and wait until
9675 * it has exited.
9676 */
9677 if (ifp->if_start_thread != THREAD_NULL) {
9678 lck_mtx_lock_spin(&ifp->if_start_lock);
9679 ifp->if_start_flags |= IFSF_TERMINATING;
9680 wakeup_one((caddr_t)&ifp->if_start_thread);
9681 lck_mtx_unlock(&ifp->if_start_lock);
9682
9683 /* wait for starter thread to terminate */
9684 lck_mtx_lock(&ifp->if_start_lock);
9685 while (ifp->if_start_thread != THREAD_NULL) {
9686 if (dlil_verbose) {
9687 DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
9688 __func__,
9689 if_name(ifp));
9690 }
9691 (void) msleep(&ifp->if_start_thread,
9692 &ifp->if_start_lock, (PZERO - 1),
9693 "ifnet_start_thread_exit", NULL);
9694 }
9695 lck_mtx_unlock(&ifp->if_start_lock);
9696 if (dlil_verbose) {
9697 DLIL_PRINTF("%s: %s starter thread termination complete",
9698 __func__, if_name(ifp));
9699 }
9700 }
9701
9702 /*
9703 * Signal the poller thread to terminate itself, and wait until
9704 * it has exited.
9705 */
9706 if (ifp->if_poll_thread != THREAD_NULL) {
9707 #if SKYWALK
9708 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9709 #endif /* SKYWALK */
9710 lck_mtx_lock_spin(&ifp->if_poll_lock);
9711 ifp->if_poll_flags |= IF_POLLF_TERMINATING;
9712 wakeup_one((caddr_t)&ifp->if_poll_thread);
9713 lck_mtx_unlock(&ifp->if_poll_lock);
9714
9715 /* wait for poller thread to terminate */
9716 lck_mtx_lock(&ifp->if_poll_lock);
9717 while (ifp->if_poll_thread != THREAD_NULL) {
9718 if (dlil_verbose) {
9719 DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
9720 __func__,
9721 if_name(ifp));
9722 }
9723 (void) msleep(&ifp->if_poll_thread,
9724 &ifp->if_poll_lock, (PZERO - 1),
9725 "ifnet_poll_thread_exit", NULL);
9726 }
9727 lck_mtx_unlock(&ifp->if_poll_lock);
9728 if (dlil_verbose) {
9729 DLIL_PRINTF("%s: %s poller thread termination complete\n",
9730 __func__, if_name(ifp));
9731 }
9732 }
9733
9734 /*
9735 * If thread affinity was set for the workloop thread, we will need
9736 * to tear down the affinity and release the extra reference count
9737 * taken at attach time. Does not apply to lo0 or other interfaces
9738 * without dedicated input threads.
9739 */
9740 if ((inp = ifp->if_inp) != NULL) {
9741 VERIFY(inp != dlil_main_input_thread);
9742
9743 if (inp->dlth_affinity) {
9744 struct thread *tp, *wtp, *ptp;
9745
9746 lck_mtx_lock_spin(&inp->dlth_lock);
9747 wtp = inp->dlth_driver_thread;
9748 inp->dlth_driver_thread = THREAD_NULL;
9749 ptp = inp->dlth_poller_thread;
9750 inp->dlth_poller_thread = THREAD_NULL;
9751 ASSERT(inp->dlth_thread != THREAD_NULL);
9752 tp = inp->dlth_thread; /* don't nullify now */
9753 inp->dlth_affinity_tag = 0;
9754 inp->dlth_affinity = FALSE;
9755 lck_mtx_unlock(&inp->dlth_lock);
9756
9757 /* Tear down poll thread affinity */
9758 if (ptp != NULL) {
9759 VERIFY(ifp->if_eflags & IFEF_RXPOLL);
9760 VERIFY(ifp->if_xflags & IFXF_LEGACY);
9761 (void) dlil_affinity_set(ptp,
9762 THREAD_AFFINITY_TAG_NULL);
9763 thread_deallocate(ptp);
9764 }
9765
9766 /* Tear down workloop thread affinity */
9767 if (wtp != NULL) {
9768 (void) dlil_affinity_set(wtp,
9769 THREAD_AFFINITY_TAG_NULL);
9770 thread_deallocate(wtp);
9771 }
9772
9773 /* Tear down DLIL input thread affinity */
9774 (void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
9775 thread_deallocate(tp);
9776 }
9777
9778 /* disassociate ifp DLIL input thread */
9779 ifp->if_inp = NULL;
9780
9781 /* if the worker thread was created, tell it to terminate */
9782 if (inp->dlth_thread != THREAD_NULL) {
9783 lck_mtx_lock_spin(&inp->dlth_lock);
9784 inp->dlth_flags |= DLIL_INPUT_TERMINATE;
9785 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
9786 wakeup_one((caddr_t)&inp->dlth_flags);
9787 }
9788 lck_mtx_unlock(&inp->dlth_lock);
9789 ifnet_lock_done(ifp);
9790
9791 /* wait for the input thread to terminate */
9792 lck_mtx_lock_spin(&inp->dlth_lock);
9793 while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
9794 == 0) {
9795 (void) msleep(&inp->dlth_flags, &inp->dlth_lock,
9796 (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
9797 }
9798 lck_mtx_unlock(&inp->dlth_lock);
9799 ifnet_lock_exclusive(ifp);
9800 }
9801
9802 /* clean-up input thread state */
9803 dlil_clean_threading_info(inp);
9804 /* clean-up poll parameters */
9805 VERIFY(ifp->if_poll_thread == THREAD_NULL);
9806 dlil_reset_rxpoll_params(ifp);
9807 }
9808
9809 /* The driver might unload, so point these to ourselves */
9810 if_free = ifp->if_free;
9811 ifp->if_output_dlil = ifp_if_output;
9812 ifp->if_output = ifp_if_output;
9813 ifp->if_pre_enqueue = ifp_if_output;
9814 ifp->if_start = ifp_if_start;
9815 ifp->if_output_ctl = ifp_if_ctl;
9816 ifp->if_input_dlil = ifp_if_input;
9817 ifp->if_input_poll = ifp_if_input_poll;
9818 ifp->if_input_ctl = ifp_if_ctl;
9819 ifp->if_ioctl = ifp_if_ioctl;
9820 ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
9821 ifp->if_free = ifp_if_free;
9822 ifp->if_demux = ifp_if_demux;
9823 ifp->if_event = ifp_if_event;
9824 ifp->if_framer_legacy = ifp_if_framer;
9825 ifp->if_framer = ifp_if_framer_extended;
9826 ifp->if_add_proto = ifp_if_add_proto;
9827 ifp->if_del_proto = ifp_if_del_proto;
9828 ifp->if_check_multi = ifp_if_check_multi;
9829
9830 /* wipe out interface description */
9831 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9832 ifp->if_desc.ifd_len = 0;
9833 VERIFY(ifp->if_desc.ifd_desc != NULL);
9834 bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
9835
9836 /* there shouldn't be any delegation by now */
9837 VERIFY(ifp->if_delegated.ifp == NULL);
9838 VERIFY(ifp->if_delegated.type == 0);
9839 VERIFY(ifp->if_delegated.family == 0);
9840 VERIFY(ifp->if_delegated.subfamily == 0);
9841 VERIFY(ifp->if_delegated.expensive == 0);
9842 VERIFY(ifp->if_delegated.constrained == 0);
9843
9844 /* QoS marking get cleared */
9845 if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9846 if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
9847
9848 #if SKYWALK
9849 /* the nexus destructor is responsible for clearing these */
9850 VERIFY(ifp->if_na_ops == NULL);
9851 VERIFY(ifp->if_na == NULL);
9852 #endif /* SKYWALK */
9853
9854 /* promiscuous/allmulti counts need to start at zero again */
9855 ifp->if_pcount = 0;
9856 ifp->if_amcount = 0;
9857 ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
9858
9859 ifnet_lock_done(ifp);
9860
9861 #if PF
9862 /*
9863 * Detach this interface from packet filter, if enabled.
9864 */
9865 pf_ifnet_hook(ifp, 0);
9866 #endif /* PF */
9867
9868 /* Filter list should be empty */
9869 lck_mtx_lock_spin(&ifp->if_flt_lock);
9870 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9871 VERIFY(ifp->if_flt_busy == 0);
9872 VERIFY(ifp->if_flt_waiters == 0);
9873 VERIFY(ifp->if_flt_non_os_count == 0);
9874 VERIFY(ifp->if_flt_no_tso_count == 0);
9875 lck_mtx_unlock(&ifp->if_flt_lock);
9876
9877 /* Last chance to drain send queue */
9878 if_qflush_snd(ifp, 0);
9879
9880 /* Last chance to cleanup any cached route */
9881 lck_mtx_lock(&ifp->if_cached_route_lock);
9882 VERIFY(!ifp->if_fwd_cacheok);
9883 ROUTE_RELEASE(&ifp->if_fwd_route);
9884 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9885 ROUTE_RELEASE(&ifp->if_src_route);
9886 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9887 ROUTE_RELEASE(&ifp->if_src_route6);
9888 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9889 lck_mtx_unlock(&ifp->if_cached_route_lock);
9890
9891 VERIFY(ifp->if_data_threshold == 0);
9892 VERIFY(ifp->if_dt_tcall != NULL);
9893 VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
9894
9895 ifnet_llreach_ifdetach(ifp);
9896
9897 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
9898
9899 /*
9900 * Finally, mark this ifnet as detached.
9901 */
9902 if (dlil_verbose) {
9903 DLIL_PRINTF("%s: detached\n", if_name(ifp));
9904 }
9905 lck_mtx_lock_spin(&ifp->if_ref_lock);
9906 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9907 panic("%s: flags mismatch (detaching not set) ifp=%p",
9908 __func__, ifp);
9909 /* NOTREACHED */
9910 }
9911 ifp->if_refflags &= ~IFRF_DETACHING;
9912 lck_mtx_unlock(&ifp->if_ref_lock);
9913 if (if_free != NULL) {
9914 if_free(ifp);
9915 }
9916
9917 ifclassq_release(&ifp->if_snd);
9918
9919 /* we're fully detached, clear the "in use" bit */
9920 dlifp = (struct dlil_ifnet *)ifp;
9921 lck_mtx_lock(&dlifp->dl_if_lock);
9922 ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
9923 dlifp->dl_if_flags &= ~DLIF_INUSE;
9924 lck_mtx_unlock(&dlifp->dl_if_lock);
9925
9926 /* Release reference held during ifnet attach */
9927 ifnet_release(ifp);
9928 }
9929
9930 errno_t
ifp_if_output(struct ifnet * ifp,struct mbuf * m)9931 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
9932 {
9933 #pragma unused(ifp)
9934 m_freem_list(m);
9935 return 0;
9936 }
9937
/*
 * Stub start handler installed on a detached ifnet: drain whatever is
 * still queued on the interface.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9943
9944 static errno_t
ifp_if_input(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)9945 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
9946 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
9947 boolean_t poll, struct thread *tp)
9948 {
9949 #pragma unused(ifp, m_tail, s, poll, tp)
9950 m_freem_list(m_head);
9951 return ENXIO;
9952 }
9953
9954 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9955 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9956 struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9957 {
9958 #pragma unused(ifp, flags, max_cnt)
9959 if (m_head != NULL) {
9960 *m_head = NULL;
9961 }
9962 if (m_tail != NULL) {
9963 *m_tail = NULL;
9964 }
9965 if (cnt != NULL) {
9966 *cnt = 0;
9967 }
9968 if (len != NULL) {
9969 *len = 0;
9970 }
9971 }
9972
9973 static errno_t
ifp_if_ctl(struct ifnet * ifp,ifnet_ctl_cmd_t cmd,u_int32_t arglen,void * arg)9974 ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
9975 {
9976 #pragma unused(ifp, cmd, arglen, arg)
9977 return EOPNOTSUPP;
9978 }
9979
9980 static errno_t
ifp_if_demux(struct ifnet * ifp,struct mbuf * m,char * fh,protocol_family_t * pf)9981 ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
9982 {
9983 #pragma unused(ifp, fh, pf)
9984 m_freem(m);
9985 return EJUSTRETURN;
9986 }
9987
9988 static errno_t
ifp_if_add_proto(struct ifnet * ifp,protocol_family_t pf,const struct ifnet_demux_desc * da,u_int32_t dc)9989 ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
9990 const struct ifnet_demux_desc *da, u_int32_t dc)
9991 {
9992 #pragma unused(ifp, pf, da, dc)
9993 return EINVAL;
9994 }
9995
9996 static errno_t
ifp_if_del_proto(struct ifnet * ifp,protocol_family_t pf)9997 ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
9998 {
9999 #pragma unused(ifp, pf)
10000 return EINVAL;
10001 }
10002
10003 static errno_t
ifp_if_check_multi(struct ifnet * ifp,const struct sockaddr * sa)10004 ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
10005 {
10006 #pragma unused(ifp, sa)
10007 return EOPNOTSUPP;
10008 }
10009
/*
 * Legacy framer stub: forward to the extended variant.  On non-macOS
 * targets the legacy KPI still carries the pre/post header-length
 * out-parameters; on macOS it does not, so NULL is passed for both.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10028
10029 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10030 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10031 const struct sockaddr *sa, const char *ll, const char *t,
10032 u_int32_t *pre, u_int32_t *post)
10033 {
10034 #pragma unused(ifp, sa, ll, t)
10035 m_freem(*m);
10036 *m = NULL;
10037
10038 if (pre != NULL) {
10039 *pre = 0;
10040 }
10041 if (post != NULL) {
10042 *post = 0;
10043 }
10044
10045 return EJUSTRETURN;
10046 }
10047
10048 errno_t
ifp_if_ioctl(struct ifnet * ifp,unsigned long cmd,void * arg)10049 ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
10050 {
10051 #pragma unused(ifp, cmd, arg)
10052 return EOPNOTSUPP;
10053 }
10054
10055 static errno_t
ifp_if_set_bpf_tap(struct ifnet * ifp,bpf_tap_mode tm,bpf_packet_func f)10056 ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
10057 {
10058 #pragma unused(ifp, tm, f)
10059 /* XXX not sure what to do here */
10060 return 0;
10061 }
10062
static void
ifp_if_free(struct ifnet *ifp)
{
	/* Stub free routine for a detached ifnet: nothing left to release. */
	(void)ifp;
}
10068
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
	/* Stub event handler for a detached ifnet: events are ignored. */
	(void)ifp;
	(void)e;
}
10074
/*
 * Find or allocate a dlil_ifnet for the <family, uniqueid, ifxname>
 * tuple and return it through *ifp with a dlil reference held.
 *
 * Returns 0 on success; EBUSY when an in-use interface already carries
 * the same extended name or unique id; ENOMEM when the unique-id copy
 * cannot be allocated.  Runs entirely under the dlil_if lock.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;	/* first recyclable match */
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		/* re-check under its lock: flags may have changed since scan */
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		/* keep a private copy of the caller's unique id */
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* name/xname point at storage embedded in the dlil_ifnet */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* on any path that allocated, the object must be 64-bit aligned */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10252
/*
 * Common teardown when releasing an ifnet back to the dlil cache:
 * decrement the allocation counters, free an out-of-line broadcast
 * address buffer, reset if_name/if_xname to their embedded storage,
 * and (optionally) clear the DLIF_INUSE marker.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* a broadcast address longer than the inline buffer was heap-allocated */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10283
/* Release the ifnet without clearing its DLIF_INUSE marker. */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10289
/* Acquire the global dlil_ifnet list lock. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10295
/* Release the global dlil_ifnet list lock. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10301
/* Assert that the current thread owns the dlil_ifnet list lock. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10307
/* Detach the explicitly-plumbed protocols (INET and INET6) from ifp. */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10323
/*
 * Copy the cached IPv4 source route out of ifp into *dst.  The cache
 * lock is taken as a spin lock and converted to a full mutex before
 * the copy is performed.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10334
/*
 * Store *src as ifp's cached IPv4 source route.  If forward-route
 * caching has been disabled (if_fwd_cacheok cleared), the route is
 * released instead of cached.  Consumes the caller's reference either way.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10348
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the cached IPv6
 * source route out of ifp into *dst under the cache lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10360
/*
 * IPv6 counterpart of ifp_src_route_copyin(): cache *src on ifp, or
 * release it when forward-route caching is disabled.  Consumes the
 * caller's reference either way.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10375
/*
 * Look up the IPv4 route for src_ip, preferring ifp's cached source
 * route.  On a cache miss (unusable route or different destination),
 * perform a scoped route lookup and re-populate the cache.  Returns
 * the rtentry (with a reference for the caller) or NULL.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route src_rt;
	struct sockaddr_in *dst;

	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			/* (re)initialize the destination sockaddr */
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		/* lookup scoped to this interface's index */
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10410
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): look up the route
 * for *src_ip6 via ifp's cached IPv6 source route, falling back to a
 * scoped lookup and refreshing the cache on a miss.  Returns the
 * rtentry (with a reference for the caller) or NULL.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			/* (re)initialize the destination sockaddr */
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/* ro_rt is NULL here after the release above */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
			    (struct sockaddr *)&src_rt.ro_dst, 0, 0,
			    ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10447
/*
 * Update ifp's link-quality metric state and post a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED event when it changes.
 *
 * lqm is normalized to one of the IFNET_LQM_THRESH_* edge values.
 * 'locked' indicates whether the caller already holds the ifnet lock
 * exclusively; the lock is always dropped while the kernel event is
 * posted and reacquired before returning if the caller held it.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* kick the fast PCB timer so connections react promptly */
		atomic_bitset_32(&tcbinfo.ipi_flags,
		    INPCBINFO_HANDLE_LQM_ABORT);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return; /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10512
/*
 * Update ifp's cellular RRC state and post KEV_DL_RRC_STATE_CHANGED
 * when it changes.  The caller must hold the ifnet lock exclusively;
 * it is dropped while the kernel event is posted and reacquired
 * before returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* no-op when the state is already valid and unchanged */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10542
10543 errno_t
if_state_update(struct ifnet * ifp,struct if_interface_state * if_interface_state)10544 if_state_update(struct ifnet *ifp,
10545 struct if_interface_state *if_interface_state)
10546 {
10547 u_short if_index_available = 0;
10548
10549 ifnet_lock_exclusive(ifp);
10550
10551 if ((ifp->if_type != IFT_CELLULAR) &&
10552 (if_interface_state->valid_bitmask &
10553 IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10554 ifnet_lock_done(ifp);
10555 return ENOTSUP;
10556 }
10557 if ((if_interface_state->valid_bitmask &
10558 IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10559 (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10560 if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10561 ifnet_lock_done(ifp);
10562 return EINVAL;
10563 }
10564 if ((if_interface_state->valid_bitmask &
10565 IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10566 if_interface_state->rrc_state !=
10567 IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10568 if_interface_state->rrc_state !=
10569 IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10570 ifnet_lock_done(ifp);
10571 return EINVAL;
10572 }
10573
10574 if (if_interface_state->valid_bitmask &
10575 IF_INTERFACE_STATE_LQM_STATE_VALID) {
10576 if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10577 }
10578 if (if_interface_state->valid_bitmask &
10579 IF_INTERFACE_STATE_RRC_STATE_VALID) {
10580 if_rrc_state_update(ifp, if_interface_state->rrc_state);
10581 }
10582 if (if_interface_state->valid_bitmask &
10583 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10584 ifp->if_interface_state.valid_bitmask |=
10585 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10586 ifp->if_interface_state.interface_availability =
10587 if_interface_state->interface_availability;
10588
10589 if (ifp->if_interface_state.interface_availability ==
10590 IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10591 os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10592 __func__, if_name(ifp), ifp->if_index);
10593 if_index_available = ifp->if_index;
10594 } else {
10595 os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
10596 __func__, if_name(ifp), ifp->if_index);
10597 }
10598 }
10599 ifnet_lock_done(ifp);
10600
10601 /*
10602 * Check if the TCP connections going on this interface should be
10603 * forced to send probe packets instead of waiting for TCP timers
10604 * to fire. This is done on an explicit notification such as
10605 * SIOCSIFINTERFACESTATE which marks the interface as available.
10606 */
10607 if (if_index_available > 0) {
10608 tcp_interface_send_probe(if_index_available);
10609 }
10610
10611 return 0;
10612 }
10613
/*
 * Copy ifp's interface-state fields (RRC state, link-quality metric,
 * availability) into *if_interface_state, setting only the bits in
 * valid_bitmask that ifp itself has marked valid.  Takes the ifnet
 * lock shared for the duration of the copy.
 */
void
if_get_state(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	ifnet_lock_shared(ifp);

	if_interface_state->valid_bitmask = 0;

	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_RRC_STATE_VALID;
		if_interface_state->rrc_state =
		    ifp->if_interface_state.rrc_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
		if_interface_state->lqm_state =
		    ifp->if_interface_state.lqm_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		if_interface_state->interface_availability =
		    ifp->if_interface_state.interface_availability;
	}

	ifnet_lock_done(ifp);
}
10646
10647 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10648 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10649 {
10650 if (conn_probe > 1) {
10651 return EINVAL;
10652 }
10653 if (conn_probe == 0) {
10654 if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10655 } else {
10656 if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10657 }
10658
10659 #if NECP
10660 necp_update_all_clients();
10661 #endif /* NECP */
10662
10663 tcp_probe_connectivity(ifp, conn_probe);
10664 return 0;
10665 }
10666
10667 /* for uuid.c */
10668 static int
get_ether_index(int * ret_other_index)10669 get_ether_index(int * ret_other_index)
10670 {
10671 struct ifnet *ifp;
10672 int en0_index = 0;
10673 int other_en_index = 0;
10674 int any_ether_index = 0;
10675 short best_unit = 0;
10676
10677 *ret_other_index = 0;
10678 TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
10679 /*
10680 * find en0, or if not en0, the lowest unit en*, and if not
10681 * that, any ethernet
10682 */
10683 ifnet_lock_shared(ifp);
10684 if (strcmp(ifp->if_name, "en") == 0) {
10685 if (ifp->if_unit == 0) {
10686 /* found en0, we're done */
10687 en0_index = ifp->if_index;
10688 ifnet_lock_done(ifp);
10689 break;
10690 }
10691 if (other_en_index == 0 || ifp->if_unit < best_unit) {
10692 other_en_index = ifp->if_index;
10693 best_unit = ifp->if_unit;
10694 }
10695 } else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
10696 any_ether_index = ifp->if_index;
10697 }
10698 ifnet_lock_done(ifp);
10699 }
10700 if (en0_index == 0) {
10701 if (other_en_index != 0) {
10702 *ret_other_index = other_en_index;
10703 } else if (any_ether_index != 0) {
10704 *ret_other_index = any_ether_index;
10705 }
10706 }
10707 return en0_index;
10708 }
10709
/*
 * Fill node[] (ETHER_ADDR_LEN bytes) with an Ethernet address suitable
 * for UUID generation, preferring en0 and the interface's permanent
 * (never-changing) MAC when one has been recorded.  Returns 0 on
 * success, -1 when no Ethernet interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	/* cached en0 index; re-validated against ifindex2ifnet each call */
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10751
10752 static int
10753 sysctl_rxpoll SYSCTL_HANDLER_ARGS
10754 {
10755 #pragma unused(arg1, arg2)
10756 uint32_t i;
10757 int err;
10758
10759 i = if_rxpoll;
10760
10761 err = sysctl_handle_int(oidp, &i, 0, req);
10762 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10763 return err;
10764 }
10765
10766 if (net_rxpoll == 0) {
10767 return ENXIO;
10768 }
10769
10770 if_rxpoll = i;
10771 return err;
10772 }
10773
10774 static int
10775 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
10776 {
10777 #pragma unused(arg1, arg2)
10778 uint64_t q;
10779 int err;
10780
10781 q = if_rxpoll_mode_holdtime;
10782
10783 err = sysctl_handle_quad(oidp, &q, 0, req);
10784 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10785 return err;
10786 }
10787
10788 if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
10789 q = IF_RXPOLL_MODE_HOLDTIME_MIN;
10790 }
10791
10792 if_rxpoll_mode_holdtime = q;
10793
10794 return err;
10795 }
10796
10797 static int
10798 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
10799 {
10800 #pragma unused(arg1, arg2)
10801 uint64_t q;
10802 int err;
10803
10804 q = if_rxpoll_sample_holdtime;
10805
10806 err = sysctl_handle_quad(oidp, &q, 0, req);
10807 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10808 return err;
10809 }
10810
10811 if (q < IF_RXPOLL_SAMPLETIME_MIN) {
10812 q = IF_RXPOLL_SAMPLETIME_MIN;
10813 }
10814
10815 if_rxpoll_sample_holdtime = q;
10816
10817 return err;
10818 }
10819
10820 static int
10821 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
10822 {
10823 #pragma unused(arg1, arg2)
10824 uint64_t q;
10825 int err;
10826
10827 q = if_rxpoll_interval_time;
10828
10829 err = sysctl_handle_quad(oidp, &q, 0, req);
10830 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10831 return err;
10832 }
10833
10834 if (q < IF_RXPOLL_INTERVALTIME_MIN) {
10835 q = IF_RXPOLL_INTERVALTIME_MIN;
10836 }
10837
10838 if_rxpoll_interval_time = q;
10839
10840 return err;
10841 }
10842
10843 static int
10844 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
10845 {
10846 #pragma unused(arg1, arg2)
10847 uint32_t i;
10848 int err;
10849
10850 i = if_sysctl_rxpoll_wlowat;
10851
10852 err = sysctl_handle_int(oidp, &i, 0, req);
10853 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10854 return err;
10855 }
10856
10857 if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
10858 return EINVAL;
10859 }
10860
10861 if_sysctl_rxpoll_wlowat = i;
10862 return err;
10863 }
10864
10865 static int
10866 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
10867 {
10868 #pragma unused(arg1, arg2)
10869 uint32_t i;
10870 int err;
10871
10872 i = if_sysctl_rxpoll_whiwat;
10873
10874 err = sysctl_handle_int(oidp, &i, 0, req);
10875 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10876 return err;
10877 }
10878
10879 if (i <= if_sysctl_rxpoll_wlowat) {
10880 return EINVAL;
10881 }
10882
10883 if_sysctl_rxpoll_whiwat = i;
10884 return err;
10885 }
10886
10887 static int
10888 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
10889 {
10890 #pragma unused(arg1, arg2)
10891 int i, err;
10892
10893 i = if_sndq_maxlen;
10894
10895 err = sysctl_handle_int(oidp, &i, 0, req);
10896 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10897 return err;
10898 }
10899
10900 if (i < IF_SNDQ_MINLEN) {
10901 i = IF_SNDQ_MINLEN;
10902 }
10903
10904 if_sndq_maxlen = i;
10905 return err;
10906 }
10907
10908 static int
10909 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
10910 {
10911 #pragma unused(arg1, arg2)
10912 int i, err;
10913
10914 i = if_rcvq_maxlen;
10915
10916 err = sysctl_handle_int(oidp, &i, 0, req);
10917 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10918 return err;
10919 }
10920
10921 if (i < IF_RCVQ_MINLEN) {
10922 i = IF_RCVQ_MINLEN;
10923 }
10924
10925 if_rcvq_maxlen = i;
10926 return err;
10927 }
10928
10929 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10930 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10931 int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10932 {
10933 struct kev_dl_node_presence kev;
10934 struct sockaddr_dl *sdl;
10935 struct sockaddr_in6 *sin6;
10936 int ret = 0;
10937
10938 VERIFY(ifp);
10939 VERIFY(sa);
10940 VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10941
10942 bzero(&kev, sizeof(kev));
10943 sin6 = &kev.sin6_node_address;
10944 sdl = &kev.sdl_node_address;
10945 nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10946 kev.rssi = rssi;
10947 kev.link_quality_metric = lqm;
10948 kev.node_proximity_metric = npm;
10949 bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10950
10951 ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10952 if (ret == 0 || ret == EEXIST) {
10953 int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10954 &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10955 if (err != 0) {
10956 log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10957 "error %d\n", __func__, err);
10958 }
10959 }
10960
10961 if (ret == EEXIST) {
10962 ret = 0;
10963 }
10964 return ret;
10965 }
10966
/*
 * Note that a neighboring node has gone away; sa identifies it by either
 * IPv6 address (AF_INET6) or link-layer address (AF_LINK).  Removes the
 * node from the ND cache and, on success, posts KEV_DL_NODE_ABSENCE.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 *
		 * NOTE(review): kev_sdl was already filled in by the
		 * decompose step, so NULL is passed to nd6_alt_node_absent
		 * here; presumably a NULL sdl means "don't copy back" —
		 * confirm against nd6_alt_node_absent().
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* Stamp interface identity into the event's sdl, then post. */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
11007
/*
 * Variant of dlil_node_present() where the caller supplies the IPv6
 * address (sa, AF_INET6) and the link-layer address (sdl, AF_LINK)
 * separately instead of as one composite sockaddr.  Registers the node
 * in the ND cache and posts KEV_DL_NODE_PRESENCE.  Returns 0 on success
 * (including when the node already existed) or an errno from
 * nd6_alt_node_present().
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	/* Copy the link-layer address, stamping in interface identity. */
	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	/* EEXIST (already known) still triggers an event, flagged as update. */
	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	/* An already-present node is not an error to callers. */
	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
11051
11052 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)11053 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11054 kauth_cred_t *credp)
11055 {
11056 const u_int8_t *bytes;
11057 size_t size;
11058
11059 bytes = CONST_LLADDR(sdl);
11060 size = sdl->sdl_alen;
11061
11062 #if CONFIG_MACF
11063 if (dlil_lladdr_ckreq) {
11064 switch (sdl->sdl_type) {
11065 case IFT_ETHER:
11066 case IFT_IEEE1394:
11067 break;
11068 default:
11069 credp = NULL;
11070 break;
11071 }
11072 ;
11073
11074 if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11075 static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11076 [0] = 2
11077 };
11078
11079 bytes = unspec;
11080 }
11081 }
11082 #else
11083 #pragma unused(credp)
11084 #endif
11085
11086 if (sizep != NULL) {
11087 *sizep = size;
11088 }
11089 return bytes;
11090 }
11091
11092 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])11093 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11094 u_int8_t info[DLIL_MODARGLEN])
11095 {
11096 struct kev_dl_issues kev;
11097 struct timeval tv;
11098
11099 VERIFY(ifp != NULL);
11100 VERIFY(modid != NULL);
11101 _CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11102 _CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11103
11104 bzero(&kev, sizeof(kev));
11105
11106 microtime(&tv);
11107 kev.timestamp = tv.tv_sec;
11108 bcopy(modid, &kev.modid, DLIL_MODIDLEN);
11109 if (info != NULL) {
11110 bcopy(info, &kev.info, DLIL_MODARGLEN);
11111 }
11112
11113 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
11114 &kev.link_data, sizeof(kev), FALSE);
11115 }
11116
/*
 * Handle SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC: set or get the
 * interface's opportunistic-traffic throttling state, and report in
 * ifo_inuse the number of opportunistic TCP/UDP connections currently
 * on the interface.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* Map the ioctl flag onto a throttling level. */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		/* Get: translate the current level back into ioctl flags. */
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/* Already at the requested level: not an error to callers. */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
11175
/*
 * Query the interface send queue's current throttling level.  Returns
 * ENXIO if the interface does not use the TX starter model, else 0 with
 * *level set (IFNET_THROTTLE_OFF when the classq is not enabled).
 */
int
ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	*level = IFNET_THROTTLE_OFF;

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* Throttling works only for IFCQ, not ALTQ instances */
	if (IFCQ_IS_ENABLED(ifq)) {
		/* First member 0 marks this request as a query, not a set. */
		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
		*level = req.level;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
11201
/*
 * Set the interface send queue's throttling level.  Only
 * IFNET_THROTTLE_OFF and IFNET_THROTTLE_OPPORTUNISTIC are accepted.
 * Returns ENXIO for interfaces without the TX starter model, EINVAL for
 * an unknown level, else the classq request result.
 */
int
ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;

	switch (level) {
	case IFNET_THROTTLE_OFF:
	case IFNET_THROTTLE_OPPORTUNISTIC:
		break;
	default:
		return EINVAL;
	}

	IFCQ_LOCK(ifq);
	if (IFCQ_IS_ENABLED(ifq)) {
		/* First member 1 marks this request as an update. */
		cqrq_throttle_t req = { 1, level };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
	}
	IFCQ_UNLOCK(ifq);

	if (err == 0) {
		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
		    level);
#if NECP
		necp_update_all_clients();
#endif /* NECP */
		/* Un-throttling: restart output so queued packets drain. */
		if (level == IFNET_THROTTLE_OFF) {
			ifnet_start(ifp);
		}
	}

	return err;
}
11243
11244 errno_t
ifnet_getset_log(ifnet_t ifp,u_long cmd,struct ifreq * ifr,struct proc * p)11245 ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11246 struct proc *p)
11247 {
11248 #pragma unused(p)
11249 errno_t result = 0;
11250 uint32_t flags;
11251 int level, category, subcategory;
11252
11253 VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
11254
11255 if (cmd == SIOCSIFLOG) {
11256 if ((result = priv_check_cred(kauth_cred_get(),
11257 PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
11258 return result;
11259 }
11260
11261 level = ifr->ifr_log.ifl_level;
11262 if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
11263 result = EINVAL;
11264 }
11265
11266 flags = ifr->ifr_log.ifl_flags;
11267 if ((flags &= IFNET_LOGF_MASK) == 0) {
11268 result = EINVAL;
11269 }
11270
11271 category = ifr->ifr_log.ifl_category;
11272 subcategory = ifr->ifr_log.ifl_subcategory;
11273
11274 if (result == 0) {
11275 result = ifnet_set_log(ifp, level, flags,
11276 category, subcategory);
11277 }
11278 } else {
11279 result = ifnet_get_log(ifp, &level, &flags, &category,
11280 &subcategory);
11281 if (result == 0) {
11282 ifr->ifr_log.ifl_level = level;
11283 ifr->ifr_log.ifl_flags = flags;
11284 ifr->ifr_log.ifl_category = category;
11285 ifr->ifr_log.ifl_subcategory = subcategory;
11286 }
11287 }
11288
11289 return result;
11290 }
11291
/*
 * Apply a logging level and facility flags to the interface.  The level
 * applies to all facilities in `flags' (a subset of IFNET_LOGF_MASK);
 * facilities below DLIL are forwarded to the driver via if_output_ctl
 * when one is registered.  Returns 0 or the driver's error.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* DLIL's own facility is handled here, not by the driver. */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* IFNET_LOG_DEFAULT resets all facility flags. */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=%b "
		    "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags,
		    IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
		    category, subcategory);
	}

	return err;
}
11350
/*
 * Read back the interface's current logging parameters.  Each output
 * pointer may be NULL to skip that field.  Always returns 0.
 */
int
ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
    int32_t *category, int32_t *subcategory)
{
	if (level != NULL) {
		*level = ifp->if_log.level;
	}
	if (flags != NULL) {
		*flags = ifp->if_log.flags;
	}
	if (category != NULL) {
		*category = ifp->if_log.category;
	}
	if (subcategory != NULL) {
		*subcategory = ifp->if_log.subcategory;
	}

	return 0;
}
11370
11371 int
ifnet_notify_address(struct ifnet * ifp,int af)11372 ifnet_notify_address(struct ifnet *ifp, int af)
11373 {
11374 struct ifnet_notify_address_params na;
11375
11376 #if PF
11377 (void) pf_ifaddr_hook(ifp);
11378 #endif /* PF */
11379
11380 if (ifp->if_output_ctl == NULL) {
11381 return EOPNOTSUPP;
11382 }
11383
11384 bzero(&na, sizeof(na));
11385 na.address_family = (sa_family_t)af;
11386
11387 return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11388 sizeof(na), &na);
11389 }
11390
11391 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11392 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11393 {
11394 if (ifp == NULL || flowid == NULL) {
11395 return EINVAL;
11396 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11397 !IF_FULLY_ATTACHED(ifp)) {
11398 return ENXIO;
11399 }
11400
11401 *flowid = ifp->if_flowhash;
11402
11403 return 0;
11404 }
11405
/*
 * Flow-control the interface: register it in the flow-control tree and
 * mark its starter thread as flow-controlled so output pauses until a
 * later flow advisory re-enables it.  Returns EINVAL for a NULL ifp,
 * ENXIO when not using the TX starter model or not fully attached, else
 * the result of ifnet_fc_add().
 */
errno_t
ifnet_disable_output(struct ifnet *ifp)
{
	int err;

	if (ifp == NULL) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    !IF_FULLY_ATTACHED(ifp)) {
		return ENXIO;
	}

	if ((err = ifnet_fc_add(ifp)) == 0) {
		/* Mark the starter thread flow-controlled. */
		lck_mtx_lock_spin(&ifp->if_start_lock);
		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
		lck_mtx_unlock(&ifp->if_start_lock);
	}
	return err;
}
11425
11426 errno_t
ifnet_enable_output(struct ifnet * ifp)11427 ifnet_enable_output(struct ifnet *ifp)
11428 {
11429 if (ifp == NULL) {
11430 return EINVAL;
11431 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11432 !IF_FULLY_ATTACHED(ifp)) {
11433 return ENXIO;
11434 }
11435
11436 ifnet_start_common(ifp, TRUE);
11437 return 0;
11438 }
11439
/*
 * Flow advisory: the flow identified by flowhash may transmit again.
 * Looks up (and detaches) the matching flow-control entry and, if the
 * interface's hash still matches, re-enables its output.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			(void) ifnet_enable_output(ifp);
		}
		/* Drop the I/O reference taken by ifnet_is_attached(). */
		ifnet_decr_iorefcnt(ifp);
	}
	/* ifnet_fc_get() detached the entry; we own and must free it. */
	ifnet_fc_entry_free(ifce);
}
11463
11464 /*
11465 * Function to compare ifnet_fc_entries in ifnet flow control tree
11466 */
11467 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11468 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11469 {
11470 return fc1->ifce_flowhash - fc2->ifce_flowhash;
11471 }
11472
/*
 * Insert ifp into the flow-control tree keyed by its flow hash.
 * Returns 0 if inserted (or already present), EAGAIN on a flow-hash
 * collision with a different interface.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer. There can be a collision
		 * on flow hash but the probability is low. Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	/* Z_WAITOK may block; hence the spin-to-full conversion above. */
	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11516
/*
 * Look up and detach the flow-control entry for flowhash.  Returns the
 * detached entry — the caller now owns it and must release it with
 * ifnet_fc_entry_free() — or NULL if no entry exists or its interface
 * is no longer attached.
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* Detach the entry; ownership passes to the caller. */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11554
11555 static void
ifnet_fc_entry_free(struct ifnet_fc_entry * ifce)11556 ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
11557 {
11558 zfree(ifnet_fc_zone, ifce);
11559 }
11560
11561 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11562 ifnet_calc_flowhash(struct ifnet *ifp)
11563 {
11564 struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11565 uint32_t flowhash = 0;
11566
11567 if (ifnet_flowhash_seed == 0) {
11568 ifnet_flowhash_seed = RandomULong();
11569 }
11570
11571 bzero(&fh, sizeof(fh));
11572
11573 (void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11574 fh.ifk_unit = ifp->if_unit;
11575 fh.ifk_flags = ifp->if_flags;
11576 fh.ifk_eflags = ifp->if_eflags;
11577 fh.ifk_capabilities = ifp->if_capabilities;
11578 fh.ifk_capenable = ifp->if_capenable;
11579 fh.ifk_output_sched_model = ifp->if_output_sched_model;
11580 fh.ifk_rand1 = RandomULong();
11581 fh.ifk_rand2 = RandomULong();
11582
11583 try_again:
11584 flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11585 if (flowhash == 0) {
11586 /* try to get a non-zero flowhash */
11587 ifnet_flowhash_seed = RandomULong();
11588 goto try_again;
11589 }
11590
11591 return flowhash;
11592 }
11593
/*
 * Store the network signature for the given address family on the
 * interface.  A zero `len' clears the signature.  Returns EINVAL for an
 * unsupported family or oversized signature, ENOMEM when the per-family
 * extra data has not been allocated, else 0.  `flags' is currently
 * unused.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		/* Same logic as AF_INET, against the IPv6 extra data. */
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11655
/*
 * Copy the interface's network signature for the given address family
 * into `data'.  On input *len is the buffer capacity; on success it is
 * updated to the signature length.  Returns EINVAL on bad arguments or
 * a too-small buffer, ENOMEM when the per-family extra data is missing,
 * ENOENT when no signature is set, else 0.  *flags, when supplied, is
 * always reported as 0.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* Caller's buffer must fit the stored signature. */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		/* Same logic as AF_INET, against the IPv6 extra data. */
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11716
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on the interface.
 * A zero prefix_len clears the corresponding slot.  Accepted lengths
 * are the NAT64_PREFIX_LEN_{32,40,48,56,64,96} values, and a prefix
 * must not carry embedded interface/link-local scope.  On success with
 * at least one prefix set, NECP clients are told to re-evaluate.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* Kick NECP so policies pick up the new prefix set. */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11782
/*
 * Copy the interface's NAT64 prefixes into `prefixes' (an array of
 * NAT64_MAX_NUM_PREFIXES entries), if any are set.  A NULL `prefixes'
 * merely probes for existence.  Returns EINVAL for a NULL ifp, ENOMEM
 * when the IPv6 extra data is missing, ENOENT when no prefix is
 * configured, else 0.
 */
int
ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, found_one = 0, error = 0;

	if (ifp == NULL) {
		return EINVAL;
	}

	if_inet6data_lock_shared(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	/* A slot is in use iff its prefix_len is non-zero. */
	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
			found_one = 1;
		}
	}

	if (found_one == 0) {
		error = ENOENT;
		goto out;
	}

	if (prefixes) {
		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
	}

out:
	if_inet6data_lock_done(ifp);

	return error;
}
11820
/*
 * Transmit-path debug hook: when HWCKSUM_DBG_FINALIZE_FORCED is set,
 * force software finalization of IPv4/IPv6 checksums (TSO packets are
 * exempt) and bump the hwcksum_dbg_finalized_* counters accordingly.
 * hoff is the offset of the IP header within the mbuf.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	/* Only act when forced finalization is on, and never for TSO. */
	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		return;
	}
}
11862
/*
 * Receive-side hardware checksum debugging.  Depending on the bits set
 * in hwcksum_dbg_mode this may (a) force partial checksum offload by
 * computing the 16-bit 1's complement sum in software, and/or (b)
 * verify -- and optionally re-adjust to a different start offset -- a
 * partial checksum value supplied by the driver/hardware.  Only
 * PF_INET and PF_INET6 packets are considered.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* The frame header must lie within the mbuf's data area */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Link-layer header length (bytes preceding m_data) */
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* discard whatever checksum state the driver reported */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is relative to the frame header */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* make rxoff relative to m_data for m_sum16() below */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			/* nothing to do if already at the target offset */
			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11987
11988 static int
11989 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
11990 {
11991 #pragma unused(arg1, arg2)
11992 u_int32_t i;
11993 int err;
11994
11995 i = hwcksum_dbg_mode;
11996
11997 err = sysctl_handle_int(oidp, &i, 0, req);
11998 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11999 return err;
12000 }
12001
12002 if (hwcksum_dbg == 0) {
12003 return ENODEV;
12004 }
12005
12006 if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12007 return EINVAL;
12008 }
12009
12010 hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12011
12012 return err;
12013 }
12014
12015 static int
12016 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12017 {
12018 #pragma unused(arg1, arg2)
12019 u_int32_t i;
12020 int err;
12021
12022 i = hwcksum_dbg_partial_rxoff_forced;
12023
12024 err = sysctl_handle_int(oidp, &i, 0, req);
12025 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12026 return err;
12027 }
12028
12029 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12030 return ENODEV;
12031 }
12032
12033 hwcksum_dbg_partial_rxoff_forced = i;
12034
12035 return err;
12036 }
12037
12038 static int
12039 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12040 {
12041 #pragma unused(arg1, arg2)
12042 u_int32_t i;
12043 int err;
12044
12045 i = hwcksum_dbg_partial_rxoff_adj;
12046
12047 err = sysctl_handle_int(oidp, &i, 0, req);
12048 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12049 return err;
12050 }
12051
12052 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12053 return ENODEV;
12054 }
12055
12056 hwcksum_dbg_partial_rxoff_adj = i;
12057
12058 return err;
12059 }
12060
12061 static int
12062 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12063 {
12064 #pragma unused(oidp, arg1, arg2)
12065 int err;
12066
12067 if (req->oldptr == USER_ADDR_NULL) {
12068 }
12069 if (req->newptr != USER_ADDR_NULL) {
12070 return EPERM;
12071 }
12072 err = SYSCTL_OUT(req, &tx_chain_len_stats,
12073 sizeof(struct chain_len_stats));
12074
12075 return err;
12076 }
12077
12078
#if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification; arbitrary bytes (appears to be gzip
 * output) used only as input data for the checksum self-tests below.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};

/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t init;         /* sumr has been computed at runtime */
	uint16_t len;           /* span length, starting at offset 0 */
	uint16_t sumr;          /* reference */
	uint16_t sumrp;         /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl[] */
#define SUMTBL_MAX ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12141
12142 static void
dlil_verify_sum16(void)12143 dlil_verify_sum16(void)
12144 {
12145 struct mbuf *m;
12146 uint8_t *buf;
12147 int n;
12148
12149 /* Make sure test data plus extra room for alignment fits in cluster */
12150 _CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);
12151
12152 kprintf("DLIL: running SUM16 self-tests ... ");
12153
12154 m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
12155 m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));
12156
12157 buf = mtod(m, uint8_t *); /* base address */
12158
12159 for (n = 0; n < SUMTBL_MAX; n++) {
12160 uint16_t len = sumtbl[n].len;
12161 int i;
12162
12163 /* Verify for all possible alignments */
12164 for (i = 0; i < (int)sizeof(uint64_t); i++) {
12165 uint16_t sum, sumr;
12166 uint8_t *c;
12167
12168 /* Copy over test data to mbuf */
12169 VERIFY(len <= sizeof(sumdata));
12170 c = buf + i;
12171 bcopy(sumdata, c, len);
12172
12173 /* Zero-offset test (align by data pointer) */
12174 m->m_data = (caddr_t)c;
12175 m->m_len = len;
12176 sum = m_sum16(m, 0, len);
12177
12178 if (!sumtbl[n].init) {
12179 sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
12180 sumtbl[n].sumr = sumr;
12181 sumtbl[n].init = TRUE;
12182 } else {
12183 sumr = sumtbl[n].sumr;
12184 }
12185
12186 /* Something is horribly broken; stop now */
12187 if (sumr != sumtbl[n].sumrp) {
12188 panic_plain("\n%s: broken in_cksum_mbuf_ref() "
12189 "for len=%d align=%d sum=0x%04x "
12190 "[expected=0x%04x]\n", __func__,
12191 len, i, sum, sumr);
12192 /* NOTREACHED */
12193 } else if (sum != sumr) {
12194 panic_plain("\n%s: broken m_sum16() for len=%d "
12195 "align=%d sum=0x%04x [expected=0x%04x]\n",
12196 __func__, len, i, sum, sumr);
12197 /* NOTREACHED */
12198 }
12199
12200 /* Alignment test by offset (fixed data pointer) */
12201 m->m_data = (caddr_t)buf;
12202 m->m_len = i + len;
12203 sum = m_sum16(m, i, len);
12204
12205 /* Something is horribly broken; stop now */
12206 if (sum != sumr) {
12207 panic_plain("\n%s: broken m_sum16() for len=%d "
12208 "offset=%d sum=0x%04x [expected=0x%04x]\n",
12209 __func__, len, i, sum, sumr);
12210 /* NOTREACHED */
12211 }
12212 #if INET
12213 /* Simple sum16 contiguous buffer test by aligment */
12214 sum = b_sum16(c, len);
12215
12216 /* Something is horribly broken; stop now */
12217 if (sum != sumr) {
12218 panic_plain("\n%s: broken b_sum16() for len=%d "
12219 "align=%d sum=0x%04x [expected=0x%04x]\n",
12220 __func__, len, i, sum, sumr);
12221 /* NOTREACHED */
12222 }
12223 #endif /* INET */
12224 }
12225 }
12226 m_freem(m);
12227
12228 kprintf("PASSED\n");
12229 }
12230 #endif /* DEBUG || DEVELOPMENT */
12231
/* Expands to a case label that returns the stringified event code */
#define CASE_STRINGIFY(x) case x: return #x

/*
 * Map a KEV_DL_* data-link kernel event code to its symbolic name.
 * Returns the empty string for codes not listed below.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
	CASE_STRINGIFY(KEV_DL_SIFFLAGS);
	CASE_STRINGIFY(KEV_DL_SIFMETRICS);
	CASE_STRINGIFY(KEV_DL_SIFMTU);
	CASE_STRINGIFY(KEV_DL_SIFPHYS);
	CASE_STRINGIFY(KEV_DL_SIFMEDIA);
	CASE_STRINGIFY(KEV_DL_SIFGENERIC);
	CASE_STRINGIFY(KEV_DL_ADDMULTI);
	CASE_STRINGIFY(KEV_DL_DELMULTI);
	CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
	CASE_STRINGIFY(KEV_DL_IF_DETACHING);
	CASE_STRINGIFY(KEV_DL_IF_DETACHED);
	CASE_STRINGIFY(KEV_DL_LINK_OFF);
	CASE_STRINGIFY(KEV_DL_LINK_ON);
	CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
	CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
	CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
	CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
	CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
	CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
	CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
	CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
	CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
	CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
	CASE_STRINGIFY(KEV_DL_ISSUES);
	CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
12268
12269 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)12270 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12271 {
12272 #pragma unused(arg1)
12273 struct ifnet *ifp = arg0;
12274
12275 if (ifnet_is_attached(ifp, 1)) {
12276 nstat_ifnet_threshold_reached(ifp->if_index);
12277 ifnet_decr_iorefcnt(ifp);
12278 }
12279 }
12280
/*
 * Called as interface byte counters advance; schedules the if_dt_tcall
 * thread call (dlil_dt_tcall_fn) once the traffic accumulated since the
 * last notification exceeds if_data_threshold.  The compare-and-swap
 * both records the new byte count and ensures only one concurrent
 * caller wins; the thread call itself is further rate-limited by
 * threshold_interval.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 * Note: OSCompareAndSwap64 has a side effect (it updates
	 * if_dt_bytes on success) and must stay in this position in
	 * the short-circuit chain.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		/* nonzero interval: defer to the next periodic boundary */
		if (tival != 0) {
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12310
12311 #if (DEVELOPMENT || DEBUG)
12312 /*
12313 * The sysctl variable name contains the input parameters of
12314 * ifnet_get_keepalive_offload_frames()
12315 * ifp (interface index): name[0]
12316 * frames_array_count: name[1]
12317 * frame_data_offset: name[2]
12318 * The return length gives used_frames_count
12319 */
/*
 * Sysctl handler returning the keepalive-offload frames for an
 * interface (see the input-parameter layout documented above).
 * Root-only, read-only; validates the three name[] parameters, copies
 * the frames produced by ifnet_get_keepalive_offload_frames() out to
 * userspace one at a time.
 */
static int
sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp)
	int *name = (int *)arg1;
	u_int namelen = arg2;
	int idx;
	ifnet_t ifp = NULL;
	u_int32_t frames_array_count;
	size_t frame_data_offset;
	u_int32_t used_frames_count;
	struct ifnet_keepalive_offload_frame *frames_array = NULL;
	int error = 0;
	u_int32_t i;

	/*
	 * Only root may look at other people's TCP frames
	 */
	error = proc_suser(current_proc());
	if (error != 0) {
		goto done;
	}
	/*
	 * Validate the input parameters
	 */
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
	goto done;
	}
	if (namelen != 3) {
		error = EINVAL;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		error = EINVAL;
		goto done;
	}
	if (req->oldlen == 0) {
		error = EINVAL;
		goto done;
	}
	idx = name[0];
	frames_array_count = name[1];
	frame_data_offset = name[2];

	/*
	 * Make sure the passed buffer is large enough.
	 * (The multiplication is done in size_t width, so the 32-bit
	 * count cannot overflow it on LP64 -- presumed safe here.)
	 */
	if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
	    req->oldlen) {
		error = ENOMEM;
		goto done;
	}

	/* Resolve the interface index under the head lock */
	ifnet_head_lock_shared();
	if (!IF_INDEX_IN_RANGE(idx)) {
		ifnet_head_done();
		error = ENOENT;
		goto done;
	}
	ifp = ifindex2ifnet[idx];
	ifnet_head_done();

	frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
		frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
		Z_WAITOK);
	if (frames_array == NULL) {
		error = ENOMEM;
		goto done;
	}

	error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
	    frames_array_count, frame_data_offset, &used_frames_count);
	if (error != 0) {
		DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
		    __func__, error);
		goto done;
	}

	/* Copy each used frame out to the caller */
	for (i = 0; i < used_frames_count; i++) {
		error = SYSCTL_OUT(req, frames_array + i,
		    sizeof(struct ifnet_keepalive_offload_frame));
		if (error != 0) {
			goto done;
		}
	}
done:
	if (frames_array != NULL) {
		kfree_data(frames_array, frames_array_count *
		    sizeof(struct ifnet_keepalive_offload_frame));
	}
	return error;
}
12411 #endif /* DEVELOPMENT || DEBUG */
12412
/*
 * Forward per-flow interface statistics to the TCP layer for
 * aggregation; thin wrapper around tcp_update_stats_per_flow().
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12419
/* Atomically OR set_flags into *flags_p; returns the pre-update value. */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12425
/* Atomically clear clear_flags in *flags_p. */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12431
/* Atomically set extended-flags bits; returns the pre-update if_eflags. */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12437
/* Atomically clear extended-flags bits on the interface. */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12443
/* Atomically set experimental-flags bits; returns the pre-update if_xflags. */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12449
/* Atomically clear experimental-flags bits on the interface. */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12455
/* Atomically bump the interface's traffic-rule generation id. */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	atomic_add_32(&ifp->if_traffic_rule_genid, 1);
}
12461
12462 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12463 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12464 {
12465 if (*genid != ifp->if_traffic_rule_genid) {
12466 *genid = ifp->if_traffic_rule_genid;
12467 return TRUE;
12468 }
12469 return FALSE;
12470 }
/*
 * Atomically publish the new traffic-rule count and bump the
 * generation id so cached readers resynchronize.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	atomic_set_32(&ifp->if_traffic_rule_count, count);
	ifnet_update_traffic_rule_genid(ifp);
}
12477
12478 static void
log_hexdump(void * data,size_t len)12479 log_hexdump(void *data, size_t len)
12480 {
12481 size_t i, j, k;
12482 unsigned char *ptr = (unsigned char *)data;
12483 #define MAX_DUMP_BUF 32
12484 unsigned char buf[3 * MAX_DUMP_BUF + 1];
12485
12486 for (i = 0; i < len; i += MAX_DUMP_BUF) {
12487 for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12488 unsigned char msnbl = ptr[j] >> 4;
12489 unsigned char lsnbl = ptr[j] & 0x0f;
12490
12491 buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12492 buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12493
12494 if ((j % 2) == 1) {
12495 buf[k++] = ' ';
12496 }
12497 if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12498 buf[k++] = ' ';
12499 }
12500 }
12501 buf[k] = 0;
12502 os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12503 }
12504 }
12505
12506 #if SKYWALK && defined(XNU_TARGET_OS_OSX)
12507 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12508 net_check_compatible_if_filter(struct ifnet *ifp)
12509 {
12510 if (ifp == NULL) {
12511 if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12512 return false;
12513 }
12514 } else {
12515 if (ifp->if_flt_non_os_count > 0) {
12516 return false;
12517 }
12518 }
12519 return true;
12520 }
12521 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12522
/*
 * Advance the output cursor `c' by the `k' bytes just written, jumping
 * to the `done' label once fewer than one byte of space remains in
 * `clen'.  Wrapped in do/while(0) so the macro expands as a single
 * statement and is safe in unbraced if/else contexts.
 */
#define DUMP_BUF_CHK() do { \
	clen -= k; \
	if (clen < 1) \
		goto done; \
	c += k; \
} while (0)
12529
int dlil_dump_top_if_qlen(char *, int);
/*
 * Debug helper: scan all interfaces and report the one with the longest
 * send (ifcq) queue and the one with the longest input queue into str.
 * Returns the number of bytes written.
 * NOTE(review): iterates ifindex2ifnet without taking the ifnet head
 * lock -- presumably acceptable for a best-effort debug dump; confirm.
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;
	struct ifnet *top_ifcq_ifp = NULL;
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;
	uint32_t top_inq_len = 0;

	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		/* track the longest send queue */
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		/* track the longest input queue */
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12571
12572 #if DEVELOPMENT || DEBUG
/*
 * Sysctl handler (DEVELOPMENT/DEBUG only) that installs a flow key for
 * packet dump tracing.  Accepts a struct flow_key written by userspace
 * and validates it before copying it into flow_key_trace: the key must
 * describe a fully-specified UDP flow (IPv4 or IPv6, nonzero ports,
 * non-wildcard addresses), or be entirely zero to clear tracing.
 */
__private_extern__ int
packet_dump_trace_update(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
{
	struct flow_key key = {};
	int error = 0;

	/* write-only sysctl: a new value must be supplied */
	if (req->newptr == USER_ADDR_NULL) {
		return EINVAL;
	}
	if (req->newlen < sizeof(struct flow_key)) {
		return EINVAL;
	}
	error = SYSCTL_IN(req, &key, sizeof(struct flow_key));
	if (error != 0) {
		return error;
	}

	switch (key.fk_ipver) {
	case IPVERSION:
		/* IPv4: UDP only, both ports and both addresses specified */
		if (key.fk_proto != IPPROTO_UDP ||
		    key.fk_sport == 0 || key.fk_dport == 0) {
			return EINVAL;
		}

		if (key.fk_src4.s_addr == INADDR_ANY ||
		    key.fk_dst4.s_addr == INADDR_ANY) {
			return EINVAL;
		}

		break;
	case IPV6_VERSION:
		/* IPv6: UDP only, both ports and both addresses specified */
		if (key.fk_proto != IPPROTO_UDP ||
		    key.fk_sport == 0 || key.fk_dport == 0) {
			return EINVAL;
		}

		if (IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
			return EINVAL;
		}

		break;
	case 0:
		/* ipver 0 clears tracing; the rest of the key must be zero */
		if (key.fk_proto != 0 ||
		    key.fk_sport != 0 || key.fk_dport != 0) {
			return EINVAL;
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&key.fk_src6) ||
		    !IN6_IS_ADDR_UNSPECIFIED(&key.fk_dst6)) {
			return EINVAL;
		}

		break;
	default:
		return EINVAL;
	}

	memcpy(&flow_key_trace, &key, sizeof(struct flow_key));
	return 0;
}
12634 #endif /* DEVELOPMENT || DEBUG */
12635