1 /*
2 * Copyright (c) 1999-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections. This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34 #include <stddef.h>
35 #include <ptrauth.h>
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/socket.h>
43 #include <sys/domain.h>
44 #include <sys/user.h>
45 #include <sys/random.h>
46 #include <sys/socketvar.h>
47 #include <net/if_dl.h>
48 #include <net/if.h>
49 #include <net/route.h>
50 #include <net/if_var.h>
51 #include <net/dlil.h>
52 #include <net/if_arp.h>
53 #include <net/iptap.h>
54 #include <net/pktap.h>
55 #include <net/nwk_wq.h>
56 #include <sys/kern_event.h>
57 #include <sys/kdebug.h>
58 #include <sys/mcache.h>
59 #include <sys/syslog.h>
60 #include <sys/protosw.h>
61 #include <sys/priv.h>
62
63 #include <kern/assert.h>
64 #include <kern/task.h>
65 #include <kern/thread.h>
66 #include <kern/sched_prim.h>
67 #include <kern/locks.h>
68 #include <kern/zalloc.h>
69
70 #include <net/kpi_protocol.h>
71 #include <net/if_types.h>
72 #include <net/if_ipsec.h>
73 #include <net/if_llreach.h>
74 #include <net/if_utun.h>
75 #include <net/kpi_interfacefilter.h>
76 #include <net/classq/classq.h>
77 #include <net/classq/classq_sfb.h>
78 #include <net/flowhash.h>
79 #include <net/ntstat.h>
80 #if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
81 #include <skywalk/lib/net_filter_event.h>
82 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
83 #include <net/if_llatbl.h>
84 #include <net/net_api_stats.h>
85 #include <net/if_ports_used.h>
86 #include <net/if_vlan_var.h>
87 #include <netinet/in.h>
88 #if INET
89 #include <netinet/in_var.h>
90 #include <netinet/igmp_var.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/udp.h>
95 #include <netinet/udp_var.h>
96 #include <netinet/if_ether.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_tclass.h>
99 #include <netinet/ip.h>
100 #include <netinet/ip_icmp.h>
101 #include <netinet/icmp_var.h>
102 #endif /* INET */
103
104 #include <net/nat464_utils.h>
105 #include <netinet6/in6_var.h>
106 #include <netinet6/nd6.h>
107 #include <netinet6/mld6_var.h>
108 #include <netinet6/scope6_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet/icmp6.h>
111 #include <net/pf_pbuf.h>
112 #include <libkern/OSAtomic.h>
113 #include <libkern/tree.h>
114
115 #include <dev/random/randomdev.h>
116 #include <machine/machine_routines.h>
117
118 #include <mach/thread_act.h>
119 #include <mach/sdt.h>
120
121 #if CONFIG_MACF
122 #include <sys/kauth.h>
123 #include <security/mac_framework.h>
124 #include <net/ethernet.h>
125 #include <net/firewire.h>
126 #endif
127
128 #if PF
129 #include <net/pfvar.h>
130 #endif /* PF */
131 #include <net/pktsched/pktsched.h>
132 #include <net/pktsched/pktsched_netem.h>
133
134 #if NECP
135 #include <net/necp.h>
136 #endif /* NECP */
137
138 #if SKYWALK
139 #include <skywalk/packet/packet_queue.h>
140 #include <skywalk/nexus/netif/nx_netif.h>
141 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
142 #endif /* SKYWALK */
143
144 #include <os/log.h>
145
146 #define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0)
147 #define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2)
148 #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
149 #define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
150 #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
151
152 #define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
153 #define MAX_LINKADDR 4 /* LONGWORDS */
154 #define M_NKE M_IFADDR
155
156 #if 1
157 #define DLIL_PRINTF printf
158 #else
159 #define DLIL_PRINTF kprintf
160 #endif
161
162 #define IF_DATA_REQUIRE_ALIGNED_64(f) \
163 _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
164
165 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \
166 _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
167
/* Protocol KPI versions; recorded in if_proto.proto_kpi to select kpi.v1/v2 */
enum {
	kProtoKPI_v1 = 1,
	kProtoKPI_v2 = 2
};
172
173 /*
174 * List of if_proto structures in if_proto_hash[] is protected by
175 * the ifnet lock. The rest of the fields are initialized at protocol
176 * attach time and never change, thus no lock required as long as
177 * a reference to it is valid, via if_proto_ref().
178 */
struct if_proto {
	SLIST_ENTRY(if_proto) next_hash;	/* if_proto_hash[] bucket linkage (ifnet lock) */
	u_int32_t refcount;		/* refs; see if_proto_ref()/if_proto_free() */
	u_int32_t detached;		/* non-zero once detached from the ifnet */
	struct ifnet *ifp;		/* interface this protocol is attached to */
	protocol_family_t protocol_family;	/* attached protocol family */
	int proto_kpi;			/* kProtoKPI_v1 or kProtoKPI_v2; selects union arm */
	union {
		/* version 1 protocol KPI callbacks */
		struct {
			proto_media_input input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v1;
		/* version 2 protocol KPI callbacks (input has a _v2 signature) */
		struct {
			proto_media_input_v2 input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v2;
	} kpi;
};
207
208 SLIST_HEAD(proto_hash_entry, if_proto);
209
210 #define DLIL_SDLDATALEN \
211 (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
212
/*
 * DLIL-private view of an interface.  The public struct ifnet is embedded
 * first, so DLIL_TO_IFP()/IFP_TO_DLIL() below are simple address casts.
 */
struct dlil_ifnet {
	struct ifnet dl_if; /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */
	u_int32_t dl_if_flags; /* flags (below) */
	u_int32_t dl_if_refcnt; /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void *dl_if_uniqueid; /* unique interface id */
	size_t dl_if_uniqueid_len; /* length of the unique id */
	char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */
	char dl_if_xnamestorage[IFXNAMSIZ]; /* external name storage */
	struct {
		struct ifaddr ifa; /* lladdr ifa */
		u_int8_t asdl[DLIL_SDLMAXLEN]; /* addr storage */
		u_int8_t msdl[DLIL_SDLMAXLEN]; /* mask storage */
	} dl_if_lladdr;
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set; /* non-zero once the above is set */
	u_int8_t dl_if_unused; /* explicit pad; currently unused */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t dl_if_attach; /* attach PC stacktrace */
	ctrace_t dl_if_detach; /* detach PC stacktrace */
};
240
241 /* Values for dl_if_flags (private to DLIL) */
242 #define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */
243 #define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */
244 #define DLIF_DEBUG 0x4 /* has debugging info */
245
246 #define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */
247
248 /* For gdb */
249 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
250
/*
 * Debug variant of dlil_ifnet (see DLIF_DEBUG): adds ifnet reference
 * hold/release counters and circular caller-stacktrace histories.
 */
struct dlil_ifnet_dbg {
	struct dlil_ifnet dldbg_dlif; /* embedded dlil_ifnet */
	u_int16_t dldbg_if_refhold_cnt; /* # ifnet references */
	u_int16_t dldbg_if_refrele_cnt; /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers;
	 * each history keeps the last IF_REF_TRACE_HIST_SIZE entries.
	 */
	ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
261
262 #define DLIL_TO_IFP(s) (&s->dl_if)
263 #define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s)
264
/*
 * Interface filter instance.  Its callbacks are invoked on the input,
 * output, event and ioctl paths of the interface it is attached to
 * (see dlil_interface_filters_input()/_output() below).
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter) filt_next; /* per-interface filter list linkage */
	u_int32_t filt_skip; /* when non-zero, filter is bypassed — NOTE(review): exact semantics set elsewhere; confirm */
	u_int32_t filt_flags; /* filter flags (presumably DLIL_IFF_*; confirm) */
	ifnet_t filt_ifp; /* interface this filter is attached to */
	const char *filt_name; /* filter name */
	void *filt_cookie; /* opaque client context passed to callbacks */
	protocol_family_t filt_protocol; /* protocol this filter applies to */
	iff_input_func filt_input; /* inbound packet callback */
	iff_output_func filt_output; /* outbound packet callback */
	iff_event_func filt_event; /* interface event callback */
	iff_ioctl_func filt_ioctl; /* ioctl callback */
	iff_detached_func filt_detached; /* invoked when the filter is detached */
};
279
280 struct proto_input_entry;
281
282 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
283
284 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
285
286 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
287 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
288 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
289 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
290 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
291
292 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
293 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
294 &dlil_lck_attributes);
295 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
296 &dlil_lck_attributes);
297
298 #if DEBUG
299 static unsigned int ifnet_debug = 1; /* debugging (enabled) */
300 #else
301 static unsigned int ifnet_debug; /* debugging (disabled) */
302 #endif /* !DEBUG */
303 static unsigned int dlif_size; /* size of dlil_ifnet to allocate */
304 static unsigned int dlif_bufsize; /* size of dlif_size + headroom */
305 static struct zone *dlif_zone; /* zone for dlil_ifnet */
306 #define DLIF_ZONE_NAME "ifnet" /* zone name */
307
308 static ZONE_DECLARE(dlif_filt_zone, "ifnet_filter",
309 sizeof(struct ifnet_filter), ZC_ZFREE_CLEARMEM);
310
311 static ZONE_DECLARE(dlif_phash_zone, "ifnet_proto_hash",
312 sizeof(struct proto_hash_entry) * PROTO_HASH_SLOTS, ZC_ZFREE_CLEARMEM);
313
314 static ZONE_DECLARE(dlif_proto_zone, "ifnet_proto",
315 sizeof(struct if_proto), ZC_ZFREE_CLEARMEM);
316
317 static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */
318 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
319 static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */
320 #define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */
321
322 static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */
323 static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */
324 static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */
325 #define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */
326
327 static u_int32_t net_rtref;
328
329 static struct dlil_main_threading_info dlil_main_input_thread_info;
330 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
331 (struct dlil_threading_info *)&dlil_main_input_thread_info;
332
333 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
334 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
335 static void dlil_if_trace(struct dlil_ifnet *, int);
336 static void if_proto_ref(struct if_proto *);
337 static void if_proto_free(struct if_proto *);
338 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
339 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
340 u_int32_t list_count);
341 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
342 static void if_flt_monitor_busy(struct ifnet *);
343 static void if_flt_monitor_unbusy(struct ifnet *);
344 static void if_flt_monitor_enter(struct ifnet *);
345 static void if_flt_monitor_leave(struct ifnet *);
346 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
347 char **, protocol_family_t);
348 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
349 protocol_family_t);
350 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
351 const struct sockaddr_dl *);
352 static int ifnet_lookup(struct ifnet *);
353 static void if_purgeaddrs(struct ifnet *);
354
355 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
356 struct mbuf *, char *);
357 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
358 struct mbuf *);
359 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
360 mbuf_t *, const struct sockaddr *, void *, char *, char *);
361 static void ifproto_media_event(struct ifnet *, protocol_family_t,
362 const struct kev_msg *);
363 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
364 unsigned long, void *);
365 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
366 struct sockaddr_dl *, size_t);
367 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
368 const struct sockaddr_dl *, const struct sockaddr *,
369 const struct sockaddr_dl *, const struct sockaddr *);
370
371 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
372 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
373 boolean_t poll, struct thread *tp);
374 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
375 struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
376 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
377 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
378 protocol_family_t *);
379 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
380 const struct ifnet_demux_desc *, u_int32_t);
381 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
382 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
383 #if !XNU_TARGET_OS_OSX
384 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
385 const struct sockaddr *, const char *, const char *,
386 u_int32_t *, u_int32_t *);
387 #else /* XNU_TARGET_OS_OSX */
388 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
389 const struct sockaddr *, const char *, const char *);
390 #endif /* XNU_TARGET_OS_OSX */
391 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
392 const struct sockaddr *, const char *, const char *,
393 u_int32_t *, u_int32_t *);
394 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
395 static void ifp_if_free(struct ifnet *);
396 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
397 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
398 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
399
400 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
401 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
402 boolean_t, struct thread *);
403 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
404 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
405 boolean_t, struct thread *);
406
407 static void dlil_main_input_thread_func(void *, wait_result_t);
408 static void dlil_main_input_thread_cont(void *, wait_result_t);
409
410 static void dlil_input_thread_func(void *, wait_result_t);
411 static void dlil_input_thread_cont(void *, wait_result_t);
412
413 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
414 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
415
416 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
417 thread_continue_t *);
418 static void dlil_terminate_input_thread(struct dlil_threading_info *);
419 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
420 struct dlil_threading_info *, struct ifnet *, boolean_t);
421 static boolean_t dlil_input_stats_sync(struct ifnet *,
422 struct dlil_threading_info *);
423 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
424 u_int32_t, ifnet_model_t, boolean_t);
425 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
426 const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
427 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
428 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
429 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
430 #if DEBUG || DEVELOPMENT
431 static void dlil_verify_sum16(void);
432 #endif /* DEBUG || DEVELOPMENT */
433 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
434 protocol_family_t);
435 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
436 protocol_family_t);
437
438 static void dlil_incr_pending_thread_count(void);
439 static void dlil_decr_pending_thread_count(void);
440
441 static void ifnet_detacher_thread_func(void *, wait_result_t);
442 static void ifnet_detacher_thread_cont(void *, wait_result_t);
443 static void ifnet_detach_final(struct ifnet *);
444 static void ifnet_detaching_enqueue(struct ifnet *);
445 static struct ifnet *ifnet_detaching_dequeue(void);
446
447 static void ifnet_start_thread_func(void *, wait_result_t);
448 static void ifnet_start_thread_cont(void *, wait_result_t);
449
450 static void ifnet_poll_thread_func(void *, wait_result_t);
451 static void ifnet_poll_thread_cont(void *, wait_result_t);
452
453 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
454 classq_pkt_t *, boolean_t, boolean_t *);
455
456 static void ifp_src_route_copyout(struct ifnet *, struct route *);
457 static void ifp_src_route_copyin(struct ifnet *, struct route *);
458 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
459 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
460
461 static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
462 static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
463 static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
464 static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
465 static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
466 static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
467 static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
468 static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
469 static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
470 static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
471 static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;
472
473 struct chain_len_stats tx_chain_len_stats;
474 static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;
475
476 #if TEST_INPUT_THREAD_TERMINATION
477 static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
478 #endif /* TEST_INPUT_THREAD_TERMINATION */
479
480 /* The following are protected by dlil_ifnet_lock */
481 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
482 static u_int32_t ifnet_detaching_cnt;
483 static boolean_t ifnet_detaching_embryonic;
484 static void *ifnet_delayed_run; /* wait channel for detaching thread */
485
486 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
487 &dlil_lck_attributes);
488
489 static uint32_t ifnet_flowhash_seed;
490
/*
 * Input key for ifnet_calc_flowhash(): a snapshot of interface identity
 * and state, plus two random values (seeded via ifnet_flowhash_seed).
 */
struct ifnet_flowhash_key {
	char ifk_name[IFNAMSIZ]; /* interface name */
	uint32_t ifk_unit; /* interface unit number */
	uint32_t ifk_flags; /* if_flags snapshot */
	uint32_t ifk_eflags; /* extended flags snapshot */
	uint32_t ifk_capabilities; /* capabilities snapshot */
	uint32_t ifk_capenable; /* enabled capabilities snapshot */
	uint32_t ifk_output_sched_model; /* output scheduling model */
	uint32_t ifk_rand1; /* random value */
	uint32_t ifk_rand2; /* random value */
};
502
503 /* Flow control entry per interface */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry; /* ifnet_fc_tree linkage */
	u_int32_t ifce_flowhash; /* interface flow hash (lookup key; see ifnet_fc_get()) */
	struct ifnet *ifce_ifp; /* back-pointer to the interface */
};
509
510 static uint32_t ifnet_calc_flowhash(struct ifnet *);
511 static int ifce_cmp(const struct ifnet_fc_entry *,
512 const struct ifnet_fc_entry *);
513 static int ifnet_fc_add(struct ifnet *);
514 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
515 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
516
517 /* protected by ifnet_fc_lock */
518 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
519 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
520 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
521
522 static ZONE_DECLARE(ifnet_fc_zone, "ifnet_fc_zone",
523 sizeof(struct ifnet_fc_entry), ZC_ZFREE_CLEARMEM);
524
525 extern void bpfdetach(struct ifnet *);
526 extern void proto_input_run(void);
527
528 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
529 u_int32_t flags);
530 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
531 u_int32_t flags);
532
533 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
534
535 #if CONFIG_MACF
536 #if !XNU_TARGET_OS_OSX
537 int dlil_lladdr_ckreq = 1;
538 #else /* XNU_TARGET_OS_OSX */
539 int dlil_lladdr_ckreq = 0;
540 #endif /* XNU_TARGET_OS_OSX */
541 #endif /* CONFIG_MACF */
542
543 #if DEBUG
544 int dlil_verbose = 1;
545 #else
546 int dlil_verbose = 0;
547 #endif /* DEBUG */
548 #if IFNET_INPUT_SANITY_CHK
549 /* sanity checking of input packet lists received */
550 static u_int32_t dlil_input_sanity_check = 0;
551 #endif /* IFNET_INPUT_SANITY_CHK */
552 /* rate limit debug messages */
553 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
554
555 SYSCTL_DECL(_net_link_generic_system);
556
557 SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
558 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");
559
560 #define IF_SNDQ_MINLEN 32
561 u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
562 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
563 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
564 sysctl_sndq_maxlen, "I", "Default transmit queue max length");
565
566 #define IF_RCVQ_MINLEN 32
567 #define IF_RCVQ_MAXLEN 256
568 u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
569 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
570 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
571 sysctl_rcvq_maxlen, "I", "Default receive queue max length");
572
573 #define IF_RXPOLL_DECAY 2 /* ilog2 of EWMA decay rate (4) */
574 u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
575 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
576 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
577 "ilog2 of EWMA decay rate of avg inbound packets");
578
579 #define IF_RXPOLL_MODE_HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10 ms */
580 #define IF_RXPOLL_MODE_HOLDTIME (1000ULL * 1000 * 1000) /* 1 sec */
581 static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
582 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
583 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
584 IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
585 "Q", "input poll mode freeze time");
586
587 #define IF_RXPOLL_SAMPLETIME_MIN (1ULL * 1000 * 1000) /* 1 ms */
588 #define IF_RXPOLL_SAMPLETIME (10ULL * 1000 * 1000) /* 10 ms */
589 static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
590 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
591 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
592 IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
593 "Q", "input poll sampling time");
594
595 static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
596 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
597 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
598 IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
599 "Q", "input poll interval (time)");
600
601 #define IF_RXPOLL_INTERVAL_PKTS 0 /* 0 (disabled) */
602 u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
603 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
604 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
605 IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
606
607 #define IF_RXPOLL_WLOWAT 10
608 static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
609 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
610 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
611 IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
612 "I", "input poll wakeup low watermark");
613
614 #define IF_RXPOLL_WHIWAT 100
615 static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
616 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
617 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
618 IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
619 "I", "input poll wakeup high watermark");
620
621 static u_int32_t if_rxpoll_max = 0; /* 0 (automatic) */
622 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
623 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
624 "max packets per poll call");
625
626 u_int32_t if_rxpoll = 1;
627 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
628 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
629 sysctl_rxpoll, "I", "enable opportunistic input polling");
630
631 #if TEST_INPUT_THREAD_TERMINATION
632 static u_int32_t if_input_thread_termination_spin = 0;
633 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
634 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
635 &if_input_thread_termination_spin, 0,
636 sysctl_input_thread_termination_spin,
637 "I", "input thread termination spin limit");
638 #endif /* TEST_INPUT_THREAD_TERMINATION */
639
640 static u_int32_t cur_dlil_input_threads = 0;
641 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
642 CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
643 "Current number of DLIL input threads");
644
645 #if IFNET_INPUT_SANITY_CHK
646 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
647 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
648 "Turn on sanity checking in DLIL input");
649 #endif /* IFNET_INPUT_SANITY_CHK */
650
651 static u_int32_t if_flowadv = 1;
652 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
653 CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
654 "enable flow-advisory mechanism");
655
656 static u_int32_t if_delaybased_queue = 1;
657 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
658 CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
659 "enable delay based dynamic queue sizing");
660
661 static uint64_t hwcksum_in_invalidated = 0;
662 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
663 hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
664 &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
665
666 uint32_t hwcksum_dbg = 0;
667 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
668 CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
669 "enable hardware cksum debugging");
670
671 u_int32_t ifnet_start_delayed = 0;
672 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
673 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
674 "number of times start was delayed");
675
676 u_int32_t ifnet_delay_start_disabled = 0;
677 SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
678 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
679 "number of times start was delayed");
680
681 static inline void
ifnet_delay_start_disabled_increment(void)682 ifnet_delay_start_disabled_increment(void)
683 {
684 OSIncrementAtomic(&ifnet_delay_start_disabled);
685 }
686
687 #define HWCKSUM_DBG_PARTIAL_FORCED 0x1 /* forced partial checksum */
688 #define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ 0x2 /* adjust start offset */
689 #define HWCKSUM_DBG_FINALIZE_FORCED 0x10 /* forced finalize */
690 #define HWCKSUM_DBG_MASK \
691 (HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ | \
692 HWCKSUM_DBG_FINALIZE_FORCED)
693
694 static uint32_t hwcksum_dbg_mode = 0;
695 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
696 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
697 0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");
698
699 static uint64_t hwcksum_dbg_partial_forced = 0;
700 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
701 hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
702 &hwcksum_dbg_partial_forced, "packets forced using partial cksum");
703
704 static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
705 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
706 hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
707 &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");
708
709 static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
710 SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
711 hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
712 &hwcksum_dbg_partial_rxoff_forced, 0,
713 sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
714 "forced partial cksum rx offset");
715
716 static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
717 SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
718 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
719 0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
720 "adjusted partial cksum rx offset");
721
722 static uint64_t hwcksum_dbg_verified = 0;
723 SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
724 hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
725 &hwcksum_dbg_verified, "packets verified for having good checksum");
726
/* debug counter: packets whose hardware-computed checksum was bad */
static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

/* debug counter: packets carrying an invalid receive checksum offset */
static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

/* debug counter: packets whose receive checksum offset was adjusted */
static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

/* debug counter: finalized packet headers */
static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

/* debug counter: finalized packet payloads */
static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

/* global switch for transmit hardware checksum offload (1 = enabled) */
uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

/* global switch for receive hardware checksum offload (1 = enabled) */
uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

/* read-only export of the transmit chain-length statistics */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

static uint32_t threshold_notify = 1; /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2; /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

/* network API usage statistics, exported under net.api_stats */
struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

/* non-zero enables wake-packet debugging */
uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1; /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
804
805 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)806 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
807 {
808 /*
809 * update filter count and route_generation ID to let TCP
810 * know it should reevalute doing TSO or not
811 */
812 if (filter_enable) {
813 OSAddAtomic(1, &ifp->if_flt_no_tso_count);
814 } else {
815 VERIFY(ifp->if_flt_no_tso_count != 0);
816 OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
817 }
818 routegenid_update();
819 }
820
#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
/* default-on state of the flowswitch IP netagent */
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
/* default-on state of the flowswitch transport netagent */
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

/* non-zero when netif is plumbed for all interfaces (see ifnet_needs_compat) */
unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;
839
840 #if (DEVELOPMENT || DEBUG)
841 static int
842 if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
843 {
844 #pragma unused(oidp, arg1, arg2)
845 unsigned int new_value;
846 int changed;
847 int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
848 &new_value, &changed);
849 if (error) {
850 return error;
851 }
852 if (changed) {
853 if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
854 (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
855 return ENOTSUP;
856 }
857 if_attach_nx = new_value;
858 }
859 return 0;
860 }
861
/* net.link.generic.system.if_attach_nx (DEVELOPMENT/DEBUG kernels only) */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */
867
868 static int
869 if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
870 {
871 #pragma unused(oidp, arg1, arg2)
872 unsigned int new_value;
873 int changed;
874 int error;
875
876 error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
877 sizeof(if_enable_fsw_transport_netagent),
878 &new_value, &changed);
879 if (error == 0 && changed != 0) {
880 if (new_value != 0 && new_value != 1) {
881 /* only allow 0 or 1 */
882 error = EINVAL;
883 } else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
884 /* netagent can be enabled/disabled */
885 if_enable_fsw_transport_netagent = new_value;
886 if (new_value == 0) {
887 kern_nexus_deregister_netagents();
888 } else {
889 kern_nexus_register_netagents();
890 }
891 } else {
892 /* netagent can't be enabled */
893 error = ENOTSUP;
894 }
895 }
896 return error;
897 }
898
/* net.link.generic.system.enable_netagent (handler defined above) */
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>
907
908 boolean_t
ifnet_nx_noauto(ifnet_t ifp)909 ifnet_nx_noauto(ifnet_t ifp)
910 {
911 return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
912 }
913
boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	/* low-latency interfaces never get an auto-attached flowswitch */
	return ifnet_is_low_latency(ifp);
}
919
920 boolean_t
ifnet_is_low_latency(ifnet_t ifp)921 ifnet_is_low_latency(ifnet_t ifp)
922 {
923 return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
924 }
925
/*
 * ifnet_needs_compat
 * - decide whether the netif compat layer should be plumbed for ifp,
 *   gated on the IF_ATTACH_NX_NETIF_COMPAT configuration bit
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point: the "ap" interface honors NETIF_ALL */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
950
951 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)952 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
953 {
954 if (if_is_fsw_transport_netagent_enabled()) {
955 /* check if netagent has been manually enabled for ipsec/utun */
956 if (ifp->if_family == IFNET_FAMILY_IPSEC) {
957 return ipsec_interface_needs_netagent(ifp);
958 } else if (ifp->if_family == IFNET_FAMILY_UTUN) {
959 return utun_interface_needs_netagent(ifp);
960 }
961
962 /* check ifnet no auto nexus override */
963 if (ifnet_nx_noauto(ifp)) {
964 return FALSE;
965 }
966
967 /* check global if_attach_nx configuration */
968 switch (ifp->if_family) {
969 case IFNET_FAMILY_CELLULAR:
970 case IFNET_FAMILY_ETHERNET:
971 if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
972 return TRUE;
973 }
974 break;
975 default:
976 break;
977 }
978 }
979 return FALSE;
980 }
981
982 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)983 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
984 {
985 #pragma unused(ifp)
986 if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
987 return TRUE;
988 }
989 return FALSE;
990 }
991
992 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)993 ifnet_needs_netif_netagent(ifnet_t ifp)
994 {
995 #pragma unused(ifp)
996 return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
997 }
998
/*
 * dlil_detach_nexus_instance
 * - detach the device port (if present) and free the given nexus
 *   provider instance; errors are logged but not propagated
 * - returns TRUE when an instance existed and teardown was attempted,
 *   FALSE when there was nothing to do
 */
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t err;

	/* no instance was ever created */
	if (instance == NULL || uuid_is_null(instance)) {
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}
1025
1026 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)1027 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
1028 uuid_t device)
1029 {
1030 boolean_t detached = FALSE;
1031 nexus_controller_t controller = kern_nexus_shared_controller();
1032 int err;
1033
1034 if (dlil_detach_nexus_instance(controller, func_str, instance,
1035 device)) {
1036 detached = TRUE;
1037 }
1038 if (provider != NULL && !uuid_is_null(provider)) {
1039 detached = TRUE;
1040 err = kern_nexus_controller_deregister_provider(controller,
1041 provider);
1042 if (err != 0) {
1043 DLIL_PRINTF("%s deregister_provider %d\n",
1044 func_str, err);
1045 }
1046 }
1047 return detached;
1048 }
1049
/*
 * dlil_create_provider_and_instance
 * - register a nexus provider named "com.apple.<type>.<ifname>" under the
 *   default domain provider for 'type', then allocate one instance of it
 * - on instance-allocation failure the provider registration is undone
 * - returns 0 on success with *provider and *instance filled in
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t dom_prov;
	errno_t err;
	nexus_name_t provider_name;
	const char *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		/* undo the registration made above */
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
	/* success falls through here with err == 0 */
failed:
	return err;
}
1099
/*
 * dlil_attach_netif_nexus_common
 * - create a netif nexus provider/instance for ifp and attach it to the
 *   interface
 * - returns TRUE with *netif_nx filled in on success, FALSE otherwise
 */
static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	/* bind the nexus to this interface's index */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	/*
	 * NOTE(review): attr is not destroyed on this success path, only in
	 * 'failed' below — verify kern_nexus_attr ownership semantics
	 * (possible leak).
	 */
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1153
1154 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)1155 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
1156 {
1157 if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
1158 IFNET_IS_VMNET(ifp)) {
1159 goto failed;
1160 }
1161 switch (ifp->if_type) {
1162 case IFT_CELLULAR:
1163 case IFT_ETHER:
1164 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
1165 /* don't auto-attach */
1166 goto failed;
1167 }
1168 break;
1169 default:
1170 /* don't auto-attach */
1171 goto failed;
1172 }
1173 return dlil_attach_netif_nexus_common(ifp, netif_nx);
1174
1175 failed:
1176 return FALSE;
1177 }
1178
1179 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)1180 dlil_is_native_netif_nexus(ifnet_t ifp)
1181 {
1182 return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
1183 }
1184
/* tear down a netif nexus: attach handle, instance, then provider */
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
1191
/*
 * dlil_siocgifdevmtu
 * - issue SIOCGIFDEVMTU to the driver and copy out the ifdevmtu result
 * - returns 0 on success, otherwise the ioctl error
 */
static inline int
dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
{
	struct ifreq ifr;
	int error;

	bzero(&ifr, sizeof(ifr));
	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
	if (error == 0) {
		*ifdm_p = ifr.ifr_devmtu;
	}
	return error;
}
1205
1206 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint64_t * buf_size,bool * use_multi_buflet)1207 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint64_t *buf_size,
1208 bool *use_multi_buflet)
1209 {
1210 struct kern_pbufpool_memory_info rx_pp_info;
1211 struct kern_pbufpool_memory_info tx_pp_info;
1212 uint32_t if_max_mtu = 0;
1213 uint32_t drv_buf_size;
1214 struct ifdevmtu ifdm;
1215 int err;
1216
1217 /*
1218 * To perform intra-stack RX aggregation flowswitch needs to use
1219 * multi-buflet packet.
1220 */
1221 *use_multi_buflet = (sk_fsw_rx_agg_tcp != 0);
1222
1223 /*
1224 * IP over Thunderbolt interface can deliver the largest IP packet,
1225 * but the driver advertises the MAX MTU as only 9K.
1226 */
1227 if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
1228 if_max_mtu = IP_MAXPACKET;
1229 goto skip_mtu_ioctl;
1230 }
1231
1232 /* determine max mtu */
1233 bzero(&ifdm, sizeof(ifdm));
1234 err = dlil_siocgifdevmtu(ifp, &ifdm);
1235 if (__improbable(err != 0)) {
1236 DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
1237 __func__, if_name(ifp));
1238 /* use default flowswitch buffer size */
1239 if_max_mtu = NX_FSW_BUFSIZE;
1240 } else {
1241 DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
1242 ifdm.ifdm_max, ifdm.ifdm_current);
1243 /* rdar://problem/44589731 */
1244 if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
1245 }
1246
1247 skip_mtu_ioctl:
1248 if (if_max_mtu == 0) {
1249 DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
1250 __func__, if_name(ifp));
1251 return EINVAL;
1252 }
1253 if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
1254 DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
1255 "max bufsize(%d)\n", __func__,
1256 if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
1257 return EINVAL;
1258 }
1259
1260 /*
1261 * for skywalk native driver, consult the driver packet pool also.
1262 */
1263 if (dlil_is_native_netif_nexus(ifp)) {
1264 err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
1265 &tx_pp_info);
1266 if (err != 0) {
1267 DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
1268 __func__, if_name(ifp));
1269 return ENXIO;
1270 }
1271 drv_buf_size = tx_pp_info.kpm_bufsize *
1272 tx_pp_info.kpm_max_frags;
1273 if (if_max_mtu > drv_buf_size) {
1274 DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1275 "tx %d * %d) can't support max mtu(%d)\n", __func__,
1276 if_name(ifp), rx_pp_info.kpm_bufsize,
1277 rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1278 tx_pp_info.kpm_max_frags, if_max_mtu);
1279 return EINVAL;
1280 }
1281 } else {
1282 drv_buf_size = if_max_mtu;
1283 }
1284
1285 if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1286 _CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1287 *use_multi_buflet = true;
1288 /* default flowswitch buffer size */
1289 *buf_size = NX_FSW_BUFSIZE;
1290 } else {
1291 *buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1292 }
1293 return 0;
1294 }
1295
/*
 * _dlil_attach_flowswitch_nexus
 * - create a flowswitch nexus provider/instance for ifp (sized via
 *   _dlil_get_flowswitch_buffer_size) and attach it to the interface's
 *   netif nexus
 * - returns TRUE with *nexus_fsw filled in on success, FALSE otherwise
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err = 0;
	uuid_t netif;
	uint64_t buf_size = 0;
	bool multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	/* size the slot buffers from the driver MTU / packet-pool geometry */
	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	/*
	 * NOTE(review): attr is not destroyed on this success path, only in
	 * 'failed' below — verify kern_nexus_attr ownership semantics
	 * (possible leak).
	 */
	return TRUE;

failed:
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1389
1390 static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)1391 dlil_attach_flowswitch_nexus(ifnet_t ifp)
1392 {
1393 boolean_t attached;
1394 if_nexus_flowswitch nexus_fsw;
1395
1396 #if (DEVELOPMENT || DEBUG)
1397 if (skywalk_netif_direct_allowed(if_name(ifp))) {
1398 DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
1399 return FALSE;
1400 }
1401 #endif /* (DEVELOPMENT || DEBUG) */
1402
1403 /*
1404 * flowswitch attachment is not supported for interface using the
1405 * legacy model (IFNET_INIT_LEGACY)
1406 */
1407 if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
1408 DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
1409 if_name(ifp));
1410 return FALSE;
1411 }
1412
1413 if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
1414 /* it's already attached */
1415 return FALSE;
1416 }
1417 bzero(&nexus_fsw, sizeof(nexus_fsw));
1418 attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
1419 if (attached) {
1420 ifnet_lock_exclusive(ifp);
1421 if (!IF_FULLY_ATTACHED(ifp)) {
1422 /* interface is going away */
1423 attached = FALSE;
1424 } else {
1425 ifp->if_nx_flowswitch = nexus_fsw;
1426 }
1427 ifnet_lock_done(ifp);
1428 if (!attached) {
1429 /* clean up flowswitch nexus */
1430 dlil_detach_flowswitch_nexus(&nexus_fsw);
1431 }
1432 }
1433 return attached;
1434 }
1435
/* tear down a flowswitch nexus: device port, instance, then provider */
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1442
1443 boolean_t
ifnet_add_netagent(ifnet_t ifp)1444 ifnet_add_netagent(ifnet_t ifp)
1445 {
1446 int error;
1447
1448 error = kern_nexus_interface_add_netagent(ifp);
1449 os_log(OS_LOG_DEFAULT,
1450 "kern_nexus_interface_add_netagent(%s) returned %d",
1451 ifp->if_xname, error);
1452 return error == 0;
1453 }
1454
1455 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1456 ifnet_remove_netagent(ifnet_t ifp)
1457 {
1458 int error;
1459
1460 error = kern_nexus_interface_remove_netagent(ifp);
1461 os_log(OS_LOG_DEFAULT,
1462 "kern_nexus_interface_remove_netagent(%s) returned %d",
1463 ifp->if_xname, error);
1464 return error == 0;
1465 }
1466
1467 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1468 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1469 {
1470 if (!IF_FULLY_ATTACHED(ifp)) {
1471 return FALSE;
1472 }
1473 return dlil_attach_flowswitch_nexus(ifp);
1474 }
1475
1476 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1477 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1478 {
1479 if_nexus_flowswitch nexus_fsw;
1480
1481 ifnet_lock_exclusive(ifp);
1482 nexus_fsw = ifp->if_nx_flowswitch;
1483 bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1484 ifnet_lock_done(ifp);
1485 return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1486 nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1487 }
1488
1489 boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)1490 ifnet_attach_netif_nexus(ifnet_t ifp)
1491 {
1492 boolean_t nexus_attached;
1493 if_nexus_netif nexus_netif;
1494
1495 if (!IF_FULLY_ATTACHED(ifp)) {
1496 return FALSE;
1497 }
1498 nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
1499 if (nexus_attached) {
1500 ifnet_lock_exclusive(ifp);
1501 ifp->if_nx_netif = nexus_netif;
1502 ifnet_lock_done(ifp);
1503 }
1504 return nexus_attached;
1505 }
1506
1507 boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)1508 ifnet_detach_netif_nexus(ifnet_t ifp)
1509 {
1510 if_nexus_netif nexus_netif;
1511
1512 ifnet_lock_exclusive(ifp);
1513 nexus_netif = ifp->if_nx_netif;
1514 bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
1515 ifnet_lock_done(ifp);
1516
1517 return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
1518 nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
1519 }
1520
1521 #endif /* SKYWALK */
1522
/*
 * DLIL_INPUT_CHECK
 * - sanity-check an inbound mbuf: it must carry a pkthdr and its receive
 *   interface must match ifp (loopback accepts any receive interface);
 *   panics otherwise
 */
#define DLIL_INPUT_CHECK(m, ifp) { \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m); \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) || \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) { \
		panic_plain("%s: invalid mbuf %p\n", __func__, m); \
		/* NOTREACHED */ \
	} \
}

/*
 * DLIL_EWMA
 * - exponentially-weighted moving average in integer arithmetic:
 *   old = old + (new - old) / 2^decay; seeds with new when old is zero
 */
#define DLIL_EWMA(old, new, decay) do { \
	u_int32_t _avg; \
	if ((_avg = (old)) > 0) \
		_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else \
		_avg = (new); \
	(old) = _avg; \
} while (0)

/* link-speed units in bits per second */
#define MBPS (1ULL * 1000 * 1000)
#define GBPS (MBPS * 1000)

/* per-link-speed RX poll watermark tuple */
struct rxpoll_time_tbl {
	u_int64_t speed; /* downlink speed */
	u_int32_t plowat; /* packets low watermark */
	u_int32_t phiwat; /* packets high watermark */
	u_int32_t blowat; /* bytes low watermark */
	u_int32_t bhiwat; /* bytes high watermark */
};

/* watermark table; the all-zero entry terminates the list */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};

/* serializes dlil_pending_thread_cnt updates and the wakeup below */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;
1564
/*
 * Increment the pending-thread counter under dlil_thread_sync_lock
 * (presumably counts DLIL threads that have not finished starting —
 * verify at call sites).
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1573
/*
 * Decrement the pending-thread counter under dlil_thread_sync_lock and
 * wake any waiters once it drops to zero.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		/* unblock anyone sleeping on the counter address */
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1586
1587 int
proto_hash_value(u_int32_t protocol_family)1588 proto_hash_value(u_int32_t protocol_family)
1589 {
1590 /*
1591 * dlil_proto_unplumb_all() depends on the mapping between
1592 * the hash bucket index and the protocol family defined
1593 * here; future changes must be applied there as well.
1594 */
1595 switch (protocol_family) {
1596 case PF_INET:
1597 return 0;
1598 case PF_INET6:
1599 return 1;
1600 case PF_VLAN:
1601 return 2;
1602 case PF_802154:
1603 return 3;
1604 case PF_UNSPEC:
1605 default:
1606 return 4;
1607 }
1608 }
1609
1610 /*
1611 * Caller must already be holding ifnet lock.
1612 */
1613 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1614 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1615 {
1616 struct if_proto *proto = NULL;
1617 u_int32_t i = proto_hash_value(protocol_family);
1618
1619 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1620
1621 if (ifp->if_proto_hash != NULL) {
1622 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1623 }
1624
1625 while (proto != NULL && proto->protocol_family != protocol_family) {
1626 proto = SLIST_NEXT(proto, next_hash);
1627 }
1628
1629 if (proto != NULL) {
1630 if_proto_ref(proto);
1631 }
1632
1633 return proto;
1634 }
1635
/* take a reference on an attached protocol */
static void
if_proto_ref(struct if_proto *proto)
{
	atomic_add_32(&proto->refcount, 1);
}

/* route cleanup for an interface/protocol pair; used by if_proto_free() */
extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1643
/*
 * if_proto_free
 * - drop a reference on an attached protocol; on the last reference,
 *   invoke the protocol's detached callback, delete its routes, post
 *   KEV_DL_PROTO_DETACHED and free the if_proto
 * - when no protocols remain attached, the interface is marked down
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = atomic_add_32_ov(&proto->refcount, -1);
	if (oldval > 1) {
		/* other references remain; nothing more to do */
		return;
	}

	/* notify the protocol (KPI v1 or v2) that it has been detached */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data));

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1705
1706 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1707 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1708 {
1709 #if !MACH_ASSERT
1710 #pragma unused(ifp)
1711 #endif
1712 unsigned int type = 0;
1713 int ass = 1;
1714
1715 switch (what) {
1716 case IFNET_LCK_ASSERT_EXCLUSIVE:
1717 type = LCK_RW_ASSERT_EXCLUSIVE;
1718 break;
1719
1720 case IFNET_LCK_ASSERT_SHARED:
1721 type = LCK_RW_ASSERT_SHARED;
1722 break;
1723
1724 case IFNET_LCK_ASSERT_OWNED:
1725 type = LCK_RW_ASSERT_HELD;
1726 break;
1727
1728 case IFNET_LCK_ASSERT_NOTOWNED:
1729 /* nothing to do here for RW lock; bypass assert */
1730 ass = 0;
1731 break;
1732
1733 default:
1734 panic("bad ifnet assert type: %d", what);
1735 /* NOTREACHED */
1736 }
1737 if (ass) {
1738 LCK_RW_ASSERT(&ifp->if_lock, type);
1739 }
1740 }
1741
/* per-ifnet lock: shared (read) acquire */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* per-ifnet lock: exclusive (write) acquire */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* per-ifnet lock: release (either mode) */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1759
#if INET
/* if_inetdata lock: shared (read) acquire */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* if_inetdata lock: exclusive (write) acquire */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* if_inetdata lock: release (either mode) */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif
1779
/* if_inet6data lock: shared (read) acquire */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* if_inet6data lock: exclusive (write) acquire */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* if_inet6data lock: release (either mode) */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1797
/* global interface-list lock: shared (read) acquire */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* global interface-list lock: exclusive (write) acquire */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* global interface-list lock: release (either mode) */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* assert the global interface-list lock is held exclusively */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1821
1822 /*
1823 * dlil_ifp_protolist
1824 * - get the list of protocols attached to the interface, or just the number
1825 * of attached protocols
1826 * - if the number returned is greater than 'list_count', truncation occurred
1827 *
1828 * Note:
1829 * - caller must already be holding ifnet lock.
1830 */
1831 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1832 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1833 u_int32_t list_count)
1834 {
1835 u_int32_t count = 0;
1836 int i;
1837
1838 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1839
1840 if (ifp->if_proto_hash == NULL) {
1841 goto done;
1842 }
1843
1844 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1845 struct if_proto *proto;
1846 SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1847 if (list != NULL && count < list_count) {
1848 list[count] = proto->protocol_family;
1849 }
1850 count++;
1851 }
1852 }
1853 done:
1854 return count;
1855 }
1856
/*
 * Exported wrapper around dlil_ifp_protolist(): takes the ifnet lock in
 * shared mode for the duration of the walk.  Returns the total number of
 * attached protocols; if that exceeds 'count' the output was truncated.
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
1865
/*
 * Free a protocol list previously allocated for if_get_protolist() callers.
 */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1871
1872 __private_extern__ int
dlil_post_msg(struct ifnet * ifp,u_int32_t event_subclass,u_int32_t event_code,struct net_event_data * event_data,u_int32_t event_data_len)1873 dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
1874 u_int32_t event_code, struct net_event_data *event_data,
1875 u_int32_t event_data_len)
1876 {
1877 struct net_event_data ev_data;
1878 struct kev_msg ev_msg;
1879
1880 bzero(&ev_msg, sizeof(ev_msg));
1881 bzero(&ev_data, sizeof(ev_data));
1882 /*
1883 * a net event always starts with a net_event_data structure
1884 * but the caller can generate a simple net event or
1885 * provide a longer event structure to post
1886 */
1887 ev_msg.vendor_code = KEV_VENDOR_APPLE;
1888 ev_msg.kev_class = KEV_NETWORK_CLASS;
1889 ev_msg.kev_subclass = event_subclass;
1890 ev_msg.event_code = event_code;
1891
1892 if (event_data == NULL) {
1893 event_data = &ev_data;
1894 event_data_len = sizeof(struct net_event_data);
1895 }
1896
1897 strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
1898 event_data->if_family = ifp->if_family;
1899 event_data->if_unit = (u_int32_t)ifp->if_unit;
1900
1901 ev_msg.dv[0].data_length = event_data_len;
1902 ev_msg.dv[0].data_ptr = event_data;
1903 ev_msg.dv[1].data_length = 0;
1904
1905 bool update_generation = true;
1906 if (event_subclass == KEV_DL_SUBCLASS) {
1907 /* Don't update interface generation for frequent link quality and state changes */
1908 switch (event_code) {
1909 case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
1910 case KEV_DL_RRC_STATE_CHANGED:
1911 case KEV_DL_NODE_PRESENCE:
1912 case KEV_DL_NODE_ABSENCE:
1913 case KEV_DL_PRIMARY_ELECTED:
1914 update_generation = false;
1915 break;
1916 default:
1917 break;
1918 }
1919 }
1920
1921 return dlil_event_internal(ifp, &ev_msg, update_generation);
1922 }
1923
/*
 * Allocate the per-interface local statistics structures (TCP/UDP counters
 * and IPv4/IPv6 ECN stats).
 *
 * The tcpstat_local/udpstat_local objects come from oversized zone buffers
 * so the stats can be placed at a 64-bit aligned address (required for the
 * atomic 64-bit counter updates); the original zone pointer is stashed in
 * the word immediately preceding the aligned base so it can be recovered at
 * free time.
 *
 * Returns 0 on success, EINVAL if ifp is NULL; on failure any partially
 * allocated structures are released.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		/* Both stats pointers must be 64-bit aligned at this point. */
		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* Error path: unwind anything that was allocated above. */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original zone pointer stored below base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
2009
/*
 * Reset all opportunistic-polling (rxpoll) state on an interface: cancel
 * the poll cycle, clear mode/flags/request counters, zero the accumulated
 * poll statistics, and clear the mode/sample hold and timestamp timers.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
2028
/*
 * Create and start a DLIL input thread for an interface (or the main input
 * thread when ifp is NULL, called once at dlil_init time).
 *
 * Picks one of four strategies:
 *  - main input thread (ifp == NULL),
 *  - legacy hybrid polling thread (IFEF_RXPOLL + IFXF_LEGACY + net_rxpoll),
 *  - asynchronous per-interface thread (net_async or IFXF_LEGACY),
 *  - synchronous strategy with no dedicated thread (netif below, no thread
 *    function; returns ENODEV after initializing the threading info).
 *
 * On success the selected thread function (if any) is returned through
 * 'thfunc', the thread is started, and its precedence/affinity policies are
 * applied.  Failure to start the main or a per-interface thread panics.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;	/* receive queue length limit */
	int error = 0;

	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */
	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/* effectively unlimited queue for non-polling interfaces */
		limit = (u_int32_t)-1;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		/* main thread also owns the loopback receive queue */
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no dedicated thread to start */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2163
#if TEST_INPUT_THREAD_TERMINATION
/*
 * sysctl handler for the input-thread-termination spin count (test builds
 * only).  Reads back the current value; on write, requires rxpoll to be
 * enabled (ENXIO otherwise) before updating the global.
 */
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	/* read-only request, or handler error: nothing more to do */
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
#endif /* TEST_INPUT_THREAD_TERMINATION */
2187
/*
 * Tear down and reset a dlil_threading_info structure after its input
 * thread has terminated: destroy the lock and lock group, clear all state,
 * and verify (via VERIFY) that no packets, affinity, or helper threads
 * remain associated with it.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	/* the packet queue must already be drained at this point */
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2213
/*
 * Terminate the calling input thread (never the main input thread): drain
 * any pending packets from its queue, signal TERMINATE_COMPLETE to the
 * waiter, drop the thread reference taken by kernel_thread_start(), and
 * self-terminate.  Must be invoked by the input thread itself, after the
 * DLIL_INPUT_TERMINATE flag has been set.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* detach any queued packets so they can be freed outside the lock */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2261
2262 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2263 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2264 {
2265 thread_affinity_policy_data_t policy;
2266
2267 bzero(&policy, sizeof(policy));
2268 policy.affinity_tag = tag;
2269 return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2270 (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2271 }
2272
#if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
/*
 * Filter-subsystem event callback: the flowswitch transport netagent is
 * enabled only while no filter subsystem is active (state == 0), then the
 * registered netagents are refreshed.
 */
static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,
    enum net_filter_event_subsystems state)
{
	if_enable_fsw_transport_netagent = (state == 0) ? 1 : 0;
	kern_nexus_update_netagents();
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
2286
/*
 * One-time DLIL initialization, called during network stack bring-up.
 *
 * Verifies (at compile time) layout/constant invariants, parses boot-args,
 * configures Skywalk netagent policy, creates the allocation zones for
 * dlil_ifnet and per-interface TCP/UDP stats, initializes the interface
 * lists and dependent subsystems, and finally starts the main DLIL input
 * thread and the interface detacher thread, waiting until both have been
 * scheduled at least once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_6LOWPAN == IFNET_FAMILY_6LOWPAN);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_DEFAULT == IFNET_SUBFAMILY_DEFAULT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* Boot-arg overrides for DLIL tunables. */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled.  This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	/* Report when the boot-arg contradicts the platform override. */
	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
#ifdef XNU_TARGET_OS_OSX
	if (if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}
#endif /* XNU_TARGET_OS_OSX */

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2600
/*
 * Mark the interface filter list busy (caller holds if_flt_lock); the
 * VERIFY catches wraparound of the busy counter.
 */
static void
if_flt_monitor_busy(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	++ifp->if_flt_busy;
	VERIFY(ifp->if_flt_busy != 0);
}
2609
/*
 * Drop a busy reference on the filter list; alias for if_flt_monitor_leave().
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2615
/*
 * Enter the filter-list monitor: with if_flt_lock held, sleep until no
 * other thread is busy with the list, then mark it busy for this thread.
 */
static void
if_flt_monitor_enter(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	while (ifp->if_flt_busy) {
		++ifp->if_flt_waiters;
		/* msleep drops and re-acquires if_flt_lock */
		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
		    (PZERO - 1), "if_flt_monitor", NULL);
	}
	if_flt_monitor_busy(ifp);
}
2628
/*
 * Leave the filter-list monitor: drop the busy count and, if this was the
 * last busy holder, wake up all waiters blocked in if_flt_monitor_enter().
 */
static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_flt_busy != 0);
	--ifp->if_flt_busy;

	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
2642
/*
 * Attach an interface filter to 'ifp'.
 *
 * Validates that the interface is still on the global list and attached
 * (taking an I/O refcount while doing so), allocates and populates a
 * struct ifnet_filter from the caller-supplied iff_filter, and inserts it
 * at the tail of the interface's filter list under the filter monitor.
 * Input/output/event/ioctl callbacks are deliberately not installed for
 * internal coprocessor interfaces.  The attached filter is returned via
 * 'filter_ref'.
 *
 * Returns 0 on success or ENXIO if the interface is gone; on failure any
 * allocated filter is freed.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* takes an extra I/O refcnt on success; released below */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 */
	if (!IFNET_IS_INTCOPROC(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the I/O refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2732
/*
 * Detach an interface filter, undoing the work done at attach time.
 *
 * filter:   filter reference returned to the caller at attach time.
 * detached: 0 for an explicit detach request (the filter must be found
 *           on some attached interface's filter list); nonzero when
 *           called from ifnet_detach_final(), where the caller has
 *           already emptied if_flt_head and only the counter
 *           adjustments and the detach callback remain to be done.
 *
 * Returns 0 on success, or EINVAL if the reference is not a currently
 * attached filter.
 */
static int
dlil_detach_filter_internal(interface_filter_t filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		/* Walk every attached interface looking for this filter ref */
		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1; /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				/* reacquire and serialize with the monitor */
				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				/* only non-internal filters are counted here */
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevaluate doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevaluate doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	/* Roll back the global attach statistics bumped at attach time */
	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
	/* re-evaluate overall filter compatibility */
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* filter is NULL past destroy, so this only fires on the EINVAL path */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2853
2854 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2855 dlil_detach_filter(interface_filter_t filter)
2856 {
2857 if (filter == NULL) {
2858 return;
2859 }
2860 dlil_detach_filter_internal(filter, 0);
2861 }
2862
2863 __private_extern__ boolean_t
dlil_has_ip_filter(void)2864 dlil_has_ip_filter(void)
2865 {
2866 boolean_t has_filter = (net_api_stats.nas_ipf_add_count > 0);
2867 DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2868 return has_filter;
2869 }
2870
2871 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2872 dlil_has_if_filter(struct ifnet *ifp)
2873 {
2874 boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2875 DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2876 return has_filter;
2877 }
2878
2879 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)2880 dlil_input_wakeup(struct dlil_threading_info *inp)
2881 {
2882 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
2883
2884 inp->dlth_flags |= DLIL_INPUT_WAITING;
2885 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
2886 inp->dlth_wtot++;
2887 wakeup_one((caddr_t)&inp->dlth_flags);
2888 }
2889 }
2890
/*
 * Bootstrap entry for the main DLIL input thread.  Transitions the
 * thread out of its embryonic state, then parks it on the continuation
 * (dlil_main_input_thread_cont), which performs all subsequent work.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before issuing the wakeup, to avoid a lost wakeup */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
2913
2914 /*
2915 * Main input thread:
2916 *
2917 * a) handles all inbound packets for lo0
2918 * b) handles all inbound packets for interfaces with no dedicated
2919 * input thread (e.g. anything but Ethernet/PDP or those that support
2920 * opportunistic polling.)
2921 * c) protocol registrations
2922 * d) packet injections
2923 */
/*
 * Continuation for the main input thread: drains the shared receive
 * queue and the lo0 queue, runs protocol registrations/injections,
 * then blocks again on dlth_flags.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* inpm and inp alias the same object; inpm exposes the lo0 queue */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* drop the lock while processing the dequeued packets */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* loop again only if more work arrived while we ran */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	/* no pending work: go back to sleep on dlth_flags */
	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3010
3011 /*
3012 * Input thread for interfaces with legacy input model.
3013 */
/*
 * Bootstrap entry for a per-interface input thread (legacy model).
 * Names the thread after its interface, transitions out of the
 * embryonic state, and parks on dlil_input_thread_cont.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* legacy threads must not be used for rxpoll-capable interfaces */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before issuing the wakeup, to avoid a lost wakeup */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3048
/*
 * Continuation for a per-interface (legacy model) input thread:
 * drains this interface's receive queue, syncs stats, then blocks
 * again — or terminates if asked to.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* honor a termination request before doing any work */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave the embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not be worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while processing the dequeued packets */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if work (other than terminate) is pending */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* no pending work: go back to sleep on dlth_flags */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3152
3153 /*
3154 * Input thread for interfaces with opportunistic polling input model.
3155 */
/*
 * Bootstrap entry for a per-interface input thread using the
 * opportunistic polling model.  Names the thread, transitions out of
 * the embryonic state, and parks on dlil_rxpoll_input_thread_cont.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	/* only legacy rxpoll-capable interfaces get this thread */
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before issuing the wakeup, to avoid a lost wakeup */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3187
/*
 * Continuation for an opportunistic-polling input thread.  Besides
 * draining the receive queue, it samples inbound packet/byte/wakeup
 * rates (EWMA), decides whether to switch the interface between
 * interrupt (POLL_OFF) and polling (POLL_ON) modes, and issues the
 * driver downcall when a transition is required.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* honor a termination request before doing any work */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass: leave embryonic state, skip sampling */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp poll interval to the supported minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not be worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* keep accumulating until the hold time has elapsed */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* rate-limit transitions by the mode hold time */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/* low traffic: fall back to interrupt mode */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			/* sustained high traffic: switch to polling mode */
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		/* drop the lock while processing the dequeued packets */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode.  Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be released below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* driver downcall to apply the new input model */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* loop again only if work (other than terminate) is pending */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* no pending work: go back to sleep on dlth_flags */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3473
3474 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3475 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3476 {
3477 if (p != NULL) {
3478 if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3479 (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3480 return EINVAL;
3481 }
3482 if (p->packets_lowat != 0 && /* hiwat must be non-zero */
3483 p->packets_lowat >= p->packets_hiwat) {
3484 return EINVAL;
3485 }
3486 if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3487 (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3488 return EINVAL;
3489 }
3490 if (p->bytes_lowat != 0 && /* hiwat must be non-zero */
3491 p->bytes_lowat >= p->bytes_hiwat) {
3492 return EINVAL;
3493 }
3494 if (p->interval_time != 0 &&
3495 p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3496 p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3497 }
3498 }
3499 return 0;
3500 }
3501
/*
 * Recompute the interface's rx-poll tuning parameters.
 *
 * ifp: interface to (re)tune; caller is responsible for locking.
 * p:   optional caller-supplied overrides; NULL means fully auto-tune
 *      from the link rate via rxpoll_tbl.
 *
 * With no link rate and no overrides, polling is effectively disabled
 * (zero low watermarks, maximal high watermarks, zero hold time).
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* pick the highest table row whose speed does not exceed inbw */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/* note: a nonzero if_rxpoll_max sysctl overrides the caller */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ? if_rxpoll_max : p->packets_limit);
		/* likewise a modified if_rxpoll_interval_time sysctl wins */
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert nanosecond hold times into timespecs on the ifnet */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3571
3572 /*
3573 * Must be called on an attached ifnet (caller is expected to check.)
3574 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3575 */
/*
 * Validate and apply rx-poll parameters for an attached interface.
 *
 * ifp:    interface; must support rxpoll and have an input thread.
 * p:      optional overrides (NULL means auto-tune).
 * locked: TRUE if the caller already holds the input thread's
 *         dlth_lock (e.g. when invoked from the input thread itself).
 *
 * Returns ENXIO if the interface cannot poll, a validation error
 * from dlil_rxpoll_validate_params(), or 0 on success.
 */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate.  If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values.  Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3613
3614 /*
3615 * Must be called on an attached ifnet (caller is expected to check.)
3616 */
3617 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3618 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3619 {
3620 struct dlil_threading_info *inp;
3621
3622 VERIFY(ifp != NULL && p != NULL);
3623 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3624 return ENXIO;
3625 }
3626
3627 bzero(p, sizeof(*p));
3628
3629 lck_mtx_lock(&inp->dlth_lock);
3630 p->packets_limit = ifp->if_rxpoll_plim;
3631 p->packets_lowat = ifp->if_rxpoll_plowat;
3632 p->packets_hiwat = ifp->if_rxpoll_phiwat;
3633 p->bytes_lowat = ifp->if_rxpoll_blowat;
3634 p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3635 p->interval_time = ifp->if_rxpoll_ival;
3636 lck_mtx_unlock(&inp->dlth_lock);
3637
3638 return 0;
3639 }
3640
3641 errno_t
ifnet_input(struct ifnet * ifp,struct mbuf * m_head,const struct ifnet_stat_increment_param * s)3642 ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
3643 const struct ifnet_stat_increment_param *s)
3644 {
3645 return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
3646 }
3647
3648 errno_t
ifnet_input_extended(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3649 ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
3650 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3651 {
3652 return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
3653 }
3654
3655 errno_t
ifnet_input_poll(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3656 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3657 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3658 {
3659 return ifnet_input_common(ifp, m_head, m_tail, s,
3660 (m_head != NULL), TRUE);
3661 }
3662
/*
 * Common implementation behind ifnet_input{,_extended,_poll}.
 *
 * ifp:    receiving interface (must be attached, or lo0).
 * m_head: head of the inbound mbuf chain (may be NULL only for poll).
 * m_tail: tail of the chain when known (extended variant), else NULL.
 * s:      caller-supplied stat increments (required when ext).
 * ext:    extended contract — s->packets_in must match the chain.
 * poll:   invoked from the opportunistic poller.
 *
 * Counts/validates the chain, then hands it to the interface's
 * if_input_dlil function.  Frees the chain and returns EINVAL on
 * bad parameters or a detached interface.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* a NULL chain is only valid for an empty poll; ext requires stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail supplied: walk the chain to count and find it */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* sanity mode: re-walk the chain to verify the tail */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	/*
	 * NOTE(review): s (not &_s) is passed down, so the recomputed
	 * _s counts reach input_func only when the caller passed a NULL
	 * s (s then aliases _s); with caller-supplied stats the _s copy
	 * above is unused.  This mirrors the historical behavior —
	 * confirm before "fixing".
	 */
	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3777
3778 #if SKYWALK
3779 errno_t
dlil_set_input_handler(struct ifnet * ifp,dlil_input_func fn)3780 dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
3781 {
3782 return atomic_test_set_ptr(&ifp->if_input_dlil,
3783 ptrauth_nop_cast(void *, &dlil_input_handler),
3784 ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
3785 }
3786
3787 void
dlil_reset_input_handler(struct ifnet * ifp)3788 dlil_reset_input_handler(struct ifnet *ifp)
3789 {
3790 while (!atomic_test_set_ptr(&ifp->if_input_dlil,
3791 ptrauth_nop_cast(void *, ifp->if_input_dlil),
3792 ptrauth_nop_cast(void *, &dlil_input_handler))) {
3793 ;
3794 }
3795 }
3796 errno_t
dlil_set_output_handler(struct ifnet * ifp,dlil_output_func fn)3797 dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
3798 {
3799 return atomic_test_set_ptr(&ifp->if_output_dlil,
3800 ptrauth_nop_cast(void *, &dlil_output_handler),
3801 ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
3802 }
3803
3804 void
dlil_reset_output_handler(struct ifnet * ifp)3805 dlil_reset_output_handler(struct ifnet *ifp)
3806 {
3807 while (!atomic_test_set_ptr(&ifp->if_output_dlil,
3808 ptrauth_nop_cast(void *, ifp->if_output_dlil),
3809 ptrauth_nop_cast(void *, &dlil_output_handler))) {
3810 ;
3811 }
3812 }
3813 #endif /* SKYWALK */
3814
3815 errno_t
dlil_output_handler(struct ifnet * ifp,struct mbuf * m)3816 dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
3817 {
3818 return ifp->if_output(ifp, m);
3819 }
3820
3821 errno_t
dlil_input_handler(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)3822 dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
3823 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
3824 boolean_t poll, struct thread *tp)
3825 {
3826 struct dlil_threading_info *inp = ifp->if_inp;
3827
3828 if (__improbable(inp == NULL)) {
3829 inp = dlil_main_input_thread;
3830 }
3831
3832 return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
3833 }
3834
/*
 * Asynchronous input strategy: enqueue the chain on the input thread's
 * receive queue and wake that thread; processing happens later in the
 * input thread's context.  `s` carries the driver's packet/byte counts
 * for this chain.  Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set. We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* drop the spin lock across the affinity/reference calls */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack. Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			_addq_multi(&inpm->lo_rcvq_pkts, &head, &tail,
			    m_cnt, m_size);
		} else {
			_addq_multi(&inp->dlth_pkts, &head, &tail,
			    m_cnt, m_size);
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/* debug-only: re-walk the chain and cross-check driver counts */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context. All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
3947
/*
 * Synchronous input strategy: enqueue the chain, then immediately drain
 * the input queue and process everything queued so far in the caller's
 * context rather than waking a dedicated thread.  Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

#if IFNET_INPUT_SANITY_CHK
	/* debug-only: re-walk the chain and cross-check driver counts */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	dlil_input_stats_add(s, inp, ifp, poll);

	/* drain the whole queue (may include earlier chains), not just ours */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(NULL, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4032
4033 #if SKYWALK
4034 errno_t
ifnet_set_output_handler(struct ifnet * ifp,ifnet_output_func fn)4035 ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
4036 {
4037 return atomic_test_set_ptr(&ifp->if_output,
4038 ptrauth_nop_cast(void *, ifp->if_save_output),
4039 ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
4040 }
4041
4042 void
ifnet_reset_output_handler(struct ifnet * ifp)4043 ifnet_reset_output_handler(struct ifnet *ifp)
4044 {
4045 while (!atomic_test_set_ptr(&ifp->if_output,
4046 ptrauth_nop_cast(void *, ifp->if_output),
4047 ptrauth_nop_cast(void *, ifp->if_save_output))) {
4048 ;
4049 }
4050 }
4051
4052 errno_t
ifnet_set_start_handler(struct ifnet * ifp,ifnet_start_func fn)4053 ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
4054 {
4055 return atomic_test_set_ptr(&ifp->if_start,
4056 ptrauth_nop_cast(void *, ifp->if_save_start),
4057 ptrauth_nop_cast(void *, fn)) ? 0 : EBUSY;
4058 }
4059
4060 void
ifnet_reset_start_handler(struct ifnet * ifp)4061 ifnet_reset_start_handler(struct ifnet *ifp)
4062 {
4063 while (!atomic_test_set_ptr(&ifp->if_start,
4064 ptrauth_nop_cast(void *, ifp->if_start),
4065 ptrauth_nop_cast(void *, ifp->if_save_start))) {
4066 ;
4067 }
4068 }
4069 #endif /* SKYWALK */
4070
4071 static void
ifnet_start_common(struct ifnet * ifp,boolean_t resetfc)4072 ifnet_start_common(struct ifnet *ifp, boolean_t resetfc)
4073 {
4074 if (!(ifp->if_eflags & IFEF_TXSTART)) {
4075 return;
4076 }
4077 /*
4078 * If the starter thread is inactive, signal it to do work,
4079 * unless the interface is being flow controlled from below,
4080 * e.g. a virtual interface being flow controlled by a real
4081 * network interface beneath it, or it's been disabled via
4082 * a call to ifnet_disable_output().
4083 */
4084 lck_mtx_lock_spin(&ifp->if_start_lock);
4085 if (resetfc) {
4086 ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
4087 } else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
4088 lck_mtx_unlock(&ifp->if_start_lock);
4089 return;
4090 }
4091 ifp->if_start_req++;
4092 if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
4093 (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
4094 IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
4095 ifp->if_start_delayed == 0)) {
4096 (void) wakeup_one((caddr_t)&ifp->if_start_thread);
4097 }
4098 lck_mtx_unlock(&ifp->if_start_lock);
4099 }
4100
4101 void
ifnet_start(struct ifnet * ifp)4102 ifnet_start(struct ifnet *ifp)
4103 {
4104 ifnet_start_common(ifp, FALSE);
4105 }
4106
/*
 * Entry point of the per-interface starter thread created at attach
 * time.  Names the thread, optionally binds the lo0 starter into the
 * main input thread's affinity set, then parks in the continuation
 * (ifnet_start_thread_cont) waiting for work.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag. This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	/* park; every later wakeup resumes in the continuation */
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4172
/*
 * Continuation of the starter thread.  Runs on every wakeup
 * (ifnet_start(), TBR timer, delayed start), repeatedly invokes the
 * driver's if_start routine while requests are pending, then re-arms
 * a (possibly timed) wait and blocks on itself again.  On termination
 * it clears if_start_thread and exits.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	/* first wakeup after creation: leave embryonic state */
	if (__improbable(ifp->if_start_embryonic)) {
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/* defer the start to batch dequeues when so configured */
		if (!IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now if rate-controlled by TBR, and if
		 * there are still packets in the send queue which haven't
		 * been dequeued so far; else sleep indefinitely (ts = NULL)
		 * until ifnet_start() is called again.
		 */
		ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
		    &ifp->if_start_cycle : NULL);

		/* a pending delayed start gets its own timed wakeup */
		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4320
4321 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4322 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4323 {
4324 if (ts == NULL) {
4325 bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4326 } else {
4327 *(&ifp->if_start_cycle) = *ts;
4328 }
4329
4330 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4331 DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4332 if_name(ifp), ts->tv_nsec);
4333 }
4334 }
4335
4336 static inline void
ifnet_poll_wakeup(struct ifnet * ifp)4337 ifnet_poll_wakeup(struct ifnet *ifp)
4338 {
4339 LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4340
4341 ifp->if_poll_req++;
4342 if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4343 ifp->if_poll_thread != THREAD_NULL) {
4344 wakeup_one((caddr_t)&ifp->if_poll_thread);
4345 }
4346 }
4347
/*
 * Request an RX poll pass: record the request and wake the poller
 * thread if it is currently idle.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4358
/*
 * Entry point of the per-interface RX poller thread.  Names the
 * thread, then parks in the continuation (ifnet_poll_thread_cont)
 * waiting for poll requests.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	/* park; every later wakeup resumes in the continuation */
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4387
/*
 * Continuation of the RX poller thread.  On each wakeup it repeatedly
 * asks the driver (if_input_poll) for up to m_lim packets and feeds
 * them into DLIL via ifnet_input_common(), until no new poll request
 * is pending; then it re-arms a (possibly timed) wait and blocks on
 * itself again.  On termination it clears if_poll_thread and exits.
 * Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	/* first wakeup after creation: leave embryonic state */
	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		u_int16_t req = ifp->if_poll_req;

		/* per-pass packet budget for the driver */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		    &m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			/* empty poll still ticks the input path */
			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4554
4555 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4556 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4557 {
4558 if (ts == NULL) {
4559 bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4560 } else {
4561 *(&ifp->if_poll_cycle) = *ts;
4562 }
4563
4564 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4565 DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4566 if_name(ifp), ts->tv_nsec);
4567 }
4568 }
4569
4570 void
ifnet_purge(struct ifnet * ifp)4571 ifnet_purge(struct ifnet *ifp)
4572 {
4573 if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4574 if_qflush_snd(ifp, false);
4575 }
4576 }
4577
4578 void
ifnet_update_sndq(struct ifclassq * ifq,cqev_t ev)4579 ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
4580 {
4581 IFCQ_LOCK_ASSERT_HELD(ifq);
4582
4583 if (!(IFCQ_IS_READY(ifq))) {
4584 return;
4585 }
4586
4587 if (IFCQ_TBR_IS_ENABLED(ifq)) {
4588 struct tb_profile tb = {
4589 .rate = ifq->ifcq_tbr.tbr_rate_raw,
4590 .percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
4591 };
4592 (void) ifclassq_tbr_set(ifq, &tb, FALSE);
4593 }
4594
4595 ifclassq_update(ifq, ev);
4596 }
4597
4598 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4599 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4600 {
4601 switch (ev) {
4602 case CLASSQ_EV_LINK_BANDWIDTH:
4603 if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4604 ifp->if_poll_update++;
4605 }
4606 break;
4607
4608 default:
4609 break;
4610 }
4611 }
4612
4613 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4614 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4615 {
4616 struct ifclassq *ifq;
4617 u_int32_t omodel;
4618 errno_t err;
4619
4620 if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4621 return EINVAL;
4622 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4623 return ENXIO;
4624 }
4625
4626 ifq = ifp->if_snd;
4627 IFCQ_LOCK(ifq);
4628 omodel = ifp->if_output_sched_model;
4629 ifp->if_output_sched_model = model;
4630 if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4631 ifp->if_output_sched_model = omodel;
4632 }
4633 IFCQ_UNLOCK(ifq);
4634
4635 return err;
4636 }
4637
4638 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4639 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4640 {
4641 if (ifp == NULL) {
4642 return EINVAL;
4643 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4644 return ENXIO;
4645 }
4646
4647 ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4648
4649 return 0;
4650 }
4651
4652 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4653 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4654 {
4655 if (ifp == NULL || maxqlen == NULL) {
4656 return EINVAL;
4657 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4658 return ENXIO;
4659 }
4660
4661 *maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4662
4663 return 0;
4664 }
4665
4666 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4667 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4668 {
4669 errno_t err;
4670
4671 if (ifp == NULL || pkts == NULL) {
4672 err = EINVAL;
4673 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4674 err = ENXIO;
4675 } else {
4676 err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4677 pkts, NULL);
4678 }
4679
4680 return err;
4681 }
4682
4683 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4684 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4685 u_int32_t *pkts, u_int32_t *bytes)
4686 {
4687 errno_t err;
4688
4689 if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4690 (pkts == NULL && bytes == NULL)) {
4691 err = EINVAL;
4692 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4693 err = ENXIO;
4694 } else {
4695 err = ifclassq_get_len(ifp->if_snd, sc, pkts, bytes);
4696 }
4697
4698 return err;
4699 }
4700
4701 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4702 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4703 {
4704 struct dlil_threading_info *inp;
4705
4706 if (ifp == NULL) {
4707 return EINVAL;
4708 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4709 return ENXIO;
4710 }
4711
4712 if (maxqlen == 0) {
4713 maxqlen = if_rcvq_maxlen;
4714 } else if (maxqlen < IF_RCVQ_MINLEN) {
4715 maxqlen = IF_RCVQ_MINLEN;
4716 }
4717
4718 inp = ifp->if_inp;
4719 lck_mtx_lock(&inp->dlth_lock);
4720 qlimit(&inp->dlth_pkts) = maxqlen;
4721 lck_mtx_unlock(&inp->dlth_lock);
4722
4723 return 0;
4724 }
4725
4726 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4727 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4728 {
4729 struct dlil_threading_info *inp;
4730
4731 if (ifp == NULL || maxqlen == NULL) {
4732 return EINVAL;
4733 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4734 return ENXIO;
4735 }
4736
4737 inp = ifp->if_inp;
4738 lck_mtx_lock(&inp->dlth_lock);
4739 *maxqlen = qlimit(&inp->dlth_pkts);
4740 lck_mtx_unlock(&inp->dlth_lock);
4741 return 0;
4742 }
4743
4744 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4745 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4746 uint16_t delay_timeout)
4747 {
4748 if (delay_qlen > 0 && delay_timeout > 0) {
4749 if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4750 ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4751 ifp->if_start_delay_timeout = min(20000, delay_timeout);
4752 /* convert timeout to nanoseconds */
4753 ifp->if_start_delay_timeout *= 1000;
4754 kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4755 ifp->if_xname, (uint32_t)delay_qlen,
4756 (uint32_t)delay_timeout);
4757 } else {
4758 if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4759 }
4760 }
4761
/*
 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf,
 * preserving the ECN bits (IPv4 TOS low bits / IPv6 flow-label ECN field).
 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
 * buf holds the full header: unaligned headers are bounced through a local
 * aligned buffer and copied back after modification.
 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned bounce buffer, large enough for either header */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			/* work on an aligned copy of the header */
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			/* DSCP already zero; nothing to do */
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * Incrementally patch the header checksum for the TOS
		 * byte change instead of recomputing it, then fold the
		 * carry back into the low 16 bits.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		if (__improbable(p == lbuf)) {
			/* write the modified header back to the caller's buffer */
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			/* work on an aligned copy of the header */
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			/* DSCP already zero; nothing to do */
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* no checksum adjustment needed for IPv6 (no header checksum) */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		if (__improbable(p == lbuf)) {
			/* write the modified header back to the caller's buffer */
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
4822
/*
 * Enqueue a single packet (mbuf or Skywalk packet) on the interface's
 * classq (or on the caller-supplied ifcq when non-NULL):
 *  - stamps the packet with an enqueue timestamp if it doesn't carry one,
 *  - updates foreground/realtime activity hints on the ifnet (and in the
 *    Skywalk nexus advisory region when attached to a flowswitch),
 *  - applies the Wi-Fi multicast DSCP-clearing workaround,
 *  - runs the delayed-start (enqueue coalescing) heuristics, and
 *  - kicks the driver's start routine when appropriate.
 * The packet is consumed on success and on error.  flush requests an
 * immediate driver start after a successful enqueue; *pdrop is set by
 * the classq to indicate whether the packet was dropped.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	uint8_t *mcast_buf = NULL;	/* non-NULL => clear DSCP below */
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here. Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			if (__improbable(len < sizeof(struct ether_header))) {
				/* ensure the Ethernet header is contiguous */
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP; skip the workaround */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* m_pullup may relocate the data; refetch */
				eh = (struct ether_header *)mbuf_data(
					p->cp_mbuf);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface. If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				/* runt frame; skip the workaround */
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP; skip the workaround */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			/* not stamped above; take the time now */
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the sampling window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and disable delay */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: evaluate the heuristics */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first packet: open a new sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5133
5134 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5135 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, classq_pkt_t *head,
5136 classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5137 boolean_t *pdrop)
5138 {
5139 int error;
5140
5141 /* enqueue the packet (caller consumes object) */
5142 error = ifclassq_enqueue(ifp->if_snd, head, tail, cnt, bytes, pdrop);
5143
5144 /*
5145 * Tell the driver to start dequeueing; do this even when the queue
5146 * for the packet is suspended (EQSUSPENDED), as the driver could still
5147 * be dequeueing from other unsuspended queues.
5148 */
5149 if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5150 ifnet_start(ifp);
5151 }
5152 return error;
5153 }
5154
5155 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5156 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5157 {
5158 struct ifnet *ifp = handle;
5159 boolean_t pdrop; /* dummy */
5160 uint32_t i;
5161
5162 ASSERT(n_pkts >= 1);
5163 for (i = 0; i < n_pkts - 1; i++) {
5164 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5165 FALSE, &pdrop);
5166 }
5167 /* flush with the last packet */
5168 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5169 TRUE, &pdrop);
5170
5171 return 0;
5172 }
5173
5174 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5175 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5176 classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5177 {
5178 if (ifp->if_output_netem != NULL) {
5179 return netem_enqueue(ifp->if_output_netem, pkt, pdrop);
5180 } else {
5181 return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5182 }
5183 }
5184
5185 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5186 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5187 {
5188 boolean_t pdrop;
5189 return ifnet_enqueue_mbuf(ifp, m, TRUE, &pdrop);
5190 }
5191
5192 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5193 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5194 boolean_t *pdrop)
5195 {
5196 classq_pkt_t pkt;
5197
5198 if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5199 m->m_nextpkt != NULL) {
5200 if (m != NULL) {
5201 m_freem_list(m);
5202 *pdrop = TRUE;
5203 }
5204 return EINVAL;
5205 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5206 !IF_FULLY_ATTACHED(ifp)) {
5207 /* flag tested without lock for performance */
5208 m_freem(m);
5209 *pdrop = TRUE;
5210 return ENXIO;
5211 } else if (!(ifp->if_flags & IFF_UP)) {
5212 m_freem(m);
5213 *pdrop = TRUE;
5214 return ENETDOWN;
5215 }
5216
5217 CLASSQ_PKT_INIT_MBUF(&pkt, m);
5218 return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5219 }
5220
5221 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5222 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5223 struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5224 boolean_t *pdrop)
5225 {
5226 classq_pkt_t head, tail;
5227
5228 ASSERT(m_head != NULL);
5229 ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5230 ASSERT(m_tail != NULL);
5231 ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5232 ASSERT(ifp != NULL);
5233 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5234
5235 if (!IF_FULLY_ATTACHED(ifp)) {
5236 /* flag tested without lock for performance */
5237 m_freem_list(m_head);
5238 *pdrop = TRUE;
5239 return ENXIO;
5240 } else if (!(ifp->if_flags & IFF_UP)) {
5241 m_freem_list(m_head);
5242 *pdrop = TRUE;
5243 return ENETDOWN;
5244 }
5245
5246 CLASSQ_PKT_INIT_MBUF(&head, m_head);
5247 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5248 return ifnet_enqueue_ifclassq_chain(ifp, &head, &tail, cnt, bytes,
5249 flush, pdrop);
5250 }
5251
5252 #if SKYWALK
5253 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5254 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5255 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5256 {
5257 classq_pkt_t pkt;
5258
5259 ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5260
5261 if (__improbable(ifp == NULL || kpkt == NULL)) {
5262 if (kpkt != NULL) {
5263 pp_free_packet(__DECONST(struct kern_pbufpool *,
5264 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5265 *pdrop = TRUE;
5266 }
5267 return EINVAL;
5268 } else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5269 !IF_FULLY_ATTACHED(ifp))) {
5270 /* flag tested without lock for performance */
5271 pp_free_packet(__DECONST(struct kern_pbufpool *,
5272 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5273 *pdrop = TRUE;
5274 return ENXIO;
5275 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5276 pp_free_packet(__DECONST(struct kern_pbufpool *,
5277 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5278 *pdrop = TRUE;
5279 return ENETDOWN;
5280 }
5281
5282 CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5283 return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5284 }
5285
5286 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5287 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5288 boolean_t flush, boolean_t *pdrop)
5289 {
5290 return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5291 }
5292
5293 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5294 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5295 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5296 {
5297 return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5298 }
5299
5300 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5301 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5302 struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5303 boolean_t *pdrop)
5304 {
5305 classq_pkt_t head, tail;
5306
5307 ASSERT(k_head != NULL);
5308 ASSERT(k_tail != NULL);
5309 ASSERT(ifp != NULL);
5310 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5311
5312 if (!IF_FULLY_ATTACHED(ifp)) {
5313 /* flag tested without lock for performance */
5314 pp_free_packet_chain(k_head, NULL);
5315 *pdrop = TRUE;
5316 return ENXIO;
5317 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5318 pp_free_packet_chain(k_head, NULL);
5319 *pdrop = TRUE;
5320 return ENETDOWN;
5321 }
5322
5323 CLASSQ_PKT_INIT_PACKET(&head, k_head);
5324 CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5325 return ifnet_enqueue_ifclassq_chain(ifp, &head, &tail, cnt, bytes,
5326 flush, pdrop);
5327 }
5328 #endif /* SKYWALK */
5329
5330 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5331 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5332 {
5333 errno_t rc;
5334 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5335
5336 if (ifp == NULL || mp == NULL) {
5337 return EINVAL;
5338 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5339 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5340 return ENXIO;
5341 }
5342 if (!ifnet_is_attached(ifp, 1)) {
5343 return ENXIO;
5344 }
5345
5346 #if SKYWALK
5347 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5348 #endif /* SKYWALK */
5349 rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5350 &pkt, NULL, NULL, NULL);
5351 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5352 ifnet_decr_iorefcnt(ifp);
5353 *mp = pkt.cp_mbuf;
5354 return rc;
5355 }
5356
5357 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5358 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5359 struct mbuf **mp)
5360 {
5361 errno_t rc;
5362 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5363
5364 if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5365 return EINVAL;
5366 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5367 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5368 return ENXIO;
5369 }
5370 if (!ifnet_is_attached(ifp, 1)) {
5371 return ENXIO;
5372 }
5373
5374 #if SKYWALK
5375 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5376 #endif /* SKYWALK */
5377 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5378 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL);
5379 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5380 ifnet_decr_iorefcnt(ifp);
5381 *mp = pkt.cp_mbuf;
5382 return rc;
5383 }
5384
5385 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5386 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5387 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5388 {
5389 errno_t rc;
5390 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5391 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5392
5393 if (ifp == NULL || head == NULL || pkt_limit < 1) {
5394 return EINVAL;
5395 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5396 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5397 return ENXIO;
5398 }
5399 if (!ifnet_is_attached(ifp, 1)) {
5400 return ENXIO;
5401 }
5402
5403 #if SKYWALK
5404 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5405 #endif /* SKYWALK */
5406 rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5407 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len);
5408 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5409 ifnet_decr_iorefcnt(ifp);
5410 *head = pkt_head.cp_mbuf;
5411 if (tail != NULL) {
5412 *tail = pkt_tail.cp_mbuf;
5413 }
5414 return rc;
5415 }
5416
5417 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5418 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5419 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5420 {
5421 errno_t rc;
5422 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5423 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5424
5425 if (ifp == NULL || head == NULL || byte_limit < 1) {
5426 return EINVAL;
5427 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5428 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5429 return ENXIO;
5430 }
5431 if (!ifnet_is_attached(ifp, 1)) {
5432 return ENXIO;
5433 }
5434
5435 #if SKYWALK
5436 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5437 #endif /* SKYWALK */
5438 rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5439 byte_limit, &pkt_head, &pkt_tail, cnt, len);
5440 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5441 ifnet_decr_iorefcnt(ifp);
5442 *head = pkt_head.cp_mbuf;
5443 if (tail != NULL) {
5444 *tail = pkt_tail.cp_mbuf;
5445 }
5446 return rc;
5447 }
5448
5449 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5450 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5451 u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5452 u_int32_t *len)
5453 {
5454 errno_t rc;
5455 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5456 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5457
5458 if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5459 !MBUF_VALID_SC(sc)) {
5460 return EINVAL;
5461 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5462 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5463 return ENXIO;
5464 }
5465 if (!ifnet_is_attached(ifp, 1)) {
5466 return ENXIO;
5467 }
5468
5469 #if SKYWALK
5470 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5471 #endif /* SKYWALK */
5472 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
5473 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
5474 cnt, len);
5475 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5476 ifnet_decr_iorefcnt(ifp);
5477 *head = pkt_head.cp_mbuf;
5478 if (tail != NULL) {
5479 *tail = pkt_tail.cp_mbuf;
5480 }
5481 return rc;
5482 }
5483
5484 #if XNU_TARGET_OS_OSX
5485 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)5486 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
5487 const struct sockaddr *dest, const char *dest_linkaddr,
5488 const char *frame_type, u_int32_t *pre, u_int32_t *post)
5489 {
5490 if (pre != NULL) {
5491 *pre = 0;
5492 }
5493 if (post != NULL) {
5494 *post = 0;
5495 }
5496
5497 return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
5498 }
5499 #endif /* XNU_TARGET_OS_OSX */
5500
5501 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5502 packet_has_vlan_tag(struct mbuf * m)
5503 {
5504 u_int tag = 0;
5505
5506 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5507 tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5508 if (tag == 0) {
5509 /* the packet is just priority-tagged, clear the bit */
5510 m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5511 }
5512 }
5513 return tag != 0;
5514 }
5515
/*
 * Run an inbound packet (*m_p, with *frame_header_p pointing at its
 * link-layer header) through the interface's attached filters.
 * Returns 0 to continue normal input processing, or the first
 * non-zero filter result (the filter then owns/consumed the packet).
 * The filter lock is dropped around each callback; the busy-monitor
 * keeps the filter list from being torn down in the meantime.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet = FALSE;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	/* note: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; monitor is busy */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5576
/*
 * Run an outbound packet (*m_p) through the interface's attached
 * filters.  Returns 0 to continue output, or the first non-zero
 * filter result (the filter then owns/consumed the packet).  Same
 * lock-drop/busy-monitor discipline as dlil_interface_filters_input().
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	/* note: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock across the callback; monitor is busy */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5626
/*
 * Deliver a packet (or packet chain, linked via m_nextpkt) to a
 * protocol attached to the interface.  v1 protocol handlers receive
 * one packet at a time with its frame header; v2 handlers receive the
 * whole chain.  Packets rejected by the handler (any error other than
 * EJUSTRETURN, which means the handler consumed them) are freed here.
 *
 * NOTE(review): if proto_kpi is neither v1 nor v2 the chain is
 * neither delivered nor freed — presumably no other values can occur;
 * verify against the protocol attach paths.
 */
static void
dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
{
	int error;

	if (ifproto->proto_kpi == kProtoKPI_v1) {
		/* Version 1 protocols get one packet at a time */
		while (m != NULL) {
			char * frame_header;
			mbuf_t next_packet;

			/* detach this packet from the chain */
			next_packet = m->m_nextpkt;
			m->m_nextpkt = NULL;
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
			    ifproto->protocol_family, m, frame_header);
			if (error != 0 && error != EJUSTRETURN) {
				m_freem(m);
			}
			m = next_packet;
		}
	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
		/* Version 2 protocols support packet lists */
		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
		    ifproto->protocol_family, m);
		if (error != 0 && error != EJUSTRETURN) {
			m_freem_list(m);
		}
	}
}
5658
5659 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5660 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5661 struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5662 {
5663 struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5664
5665 if (s->packets_in != 0) {
5666 d->packets_in += s->packets_in;
5667 }
5668 if (s->bytes_in != 0) {
5669 d->bytes_in += s->bytes_in;
5670 }
5671 if (s->errors_in != 0) {
5672 d->errors_in += s->errors_in;
5673 }
5674
5675 if (s->packets_out != 0) {
5676 d->packets_out += s->packets_out;
5677 }
5678 if (s->bytes_out != 0) {
5679 d->bytes_out += s->bytes_out;
5680 }
5681 if (s->errors_out != 0) {
5682 d->errors_out += s->errors_out;
5683 }
5684
5685 if (s->collisions != 0) {
5686 d->collisions += s->collisions;
5687 }
5688 if (s->dropped != 0) {
5689 d->dropped += s->dropped;
5690 }
5691
5692 if (poll) {
5693 PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5694 }
5695 }
5696
/*
 * Publish the input thread's accumulated statistics (inp->dlth_stats,
 * filled by dlil_input_stats_add()) into the ifnet's counters, zeroing
 * each accumulator field as it is flushed.  Returns TRUE when the
 * interface has a data threshold configured (if_data_threshold != 0),
 * telling the caller whether threshold notification work is needed.
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ipackets, s->packets_in);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ibytes, s->bytes_in);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		atomic_add_64(&ifp->if_data.ifi_ierrors, s->errors_in);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_opackets, s->packets_out);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_obytes, s->bytes_out);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		atomic_add_64(&ifp->if_data.ifi_oerrors, s->errors_out);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		atomic_add_64(&ifp->if_data.ifi_collisions, s->collisions);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		atomic_add_64(&ifp->if_data.ifi_iqdrops, s->dropped);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
5756
5757 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)5758 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
5759 {
5760 return dlil_input_packet_list_common(ifp, m, 0,
5761 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
5762 }
5763
5764 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)5765 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
5766 u_int32_t cnt, ifnet_model_t mode)
5767 {
5768 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
5769 }
5770
/*
 * Common DLIL input path.  Walks the chain of received packets rooted at
 * m, and for each packet: demuxes it to a protocol family, runs CLAT46
 * translation when applicable, adjusts partial-checksum offsets, runs
 * interface filters, and batches consecutive packets destined for the
 * same protocol attachment before handing them up via
 * dlil_ifproto_input().
 *
 * ifp_param may be NULL, in which case each packet's rcvif is used as
 * the receiving interface (packets in the chain may then belong to
 * different interfaces).  cnt/mode are only meaningful when ext is TRUE
 * (see dlil_input_packet_list_extended()) and drive opportunistic
 * polling of legacy RXPOLL interfaces.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;	/* proto of the batch being built */
	mbuf_t pkt_first = NULL;		/* head of the per-proto batch */
	mbuf_t *pkt_next = NULL;		/* tail pointer of the batch */
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;			/* 1 while holding a datamov ref on ifp */

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/*
	 * Arm the poll threshold only for extended calls in polling mode
	 * with more than one packet and a non-zero poll interval.
	 */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}

	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;             /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		/* no interface given: take it from the packet itself */
		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		/* poll legacy RXPOLL interfaces every poll_ival packets */
		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach this packet from the chain; stash its frame header */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		/* the wake-packet marker always survives classification reset */
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			/* EJUSTRETURN: demux consumed the packet */
			if (error == EJUSTRETURN) {
				goto next;
			}
			/* unknown family; filters may still claim it below */
			protocol_family = 0;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		/* optional debug logging of wake packets (first 64 bytes) */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		/* tap the packet (pre-CLAT) for packet capture */
		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
		    && dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			/* 6->4 translation may replace m and protocol_family */
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = (char *) mbuf_data(m);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceeding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has been
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		/* hardware checksum debugging, except on loopback traffic */
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			/* an implausible frame_header invalidates the checksum */
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		/* tap again post-CLAT when CLAT debugging is enabled */
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			atomic_add_64(&ifp->if_imcasts, 1);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
		if (error != 0) {
			/* EJUSTRETURN: a filter consumed the packet */
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN interface receives VLAN-tagged packets by attaching
		 * its PF_VLAN protocol to a parent interface. When a VLAN
		 * interface is a member of a bridge, the parent interface
		 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
		 * M_PROMISC packet must be processed by the VLAN protocol
		 * so that it can be sent up the stack via
		 * dlil_input_packet_list(). That allows the bridge interface's
		 * input filter, attached to the VLAN interface, to process
		 * the packet.
		 */
		if (protocol_family != PF_VLAN &&
		    (m->m_flags & M_PROMISC) != 0) {
			m_freem(m);
			goto next;
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same proto as previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		/* drop the per-iteration reference taken above */
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6103
6104 errno_t
if_mcasts_update(struct ifnet * ifp)6105 if_mcasts_update(struct ifnet *ifp)
6106 {
6107 errno_t err;
6108
6109 err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6110 if (err == EAFNOSUPPORT) {
6111 err = 0;
6112 }
6113 DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6114 "(err=%d)\n", if_name(ifp),
6115 (err == 0 ? "successfully restored" : "failed to restore"),
6116 ifp->if_updatemcasts, err);
6117
6118 /* just return success */
6119 return 0;
6120 }
6121
6122 /* If ifp is set, we will increment the generation for the interface */
6123 int
dlil_post_complete_msg(struct ifnet * ifp,struct kev_msg * event)6124 dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6125 {
6126 if (ifp != NULL) {
6127 ifnet_increment_generation(ifp);
6128 }
6129
6130 #if NECP
6131 necp_update_all_clients();
6132 #endif /* NECP */
6133
6134 return kev_post_msg(event);
6135 }
6136
6137 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6138 dlil_post_sifflags_msg(struct ifnet * ifp)
6139 {
6140 struct kev_msg ev_msg;
6141 struct net_event_data ev_data;
6142
6143 bzero(&ev_data, sizeof(ev_data));
6144 bzero(&ev_msg, sizeof(ev_msg));
6145 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6146 ev_msg.kev_class = KEV_NETWORK_CLASS;
6147 ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6148 ev_msg.event_code = KEV_DL_SIFFLAGS;
6149 strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6150 ev_data.if_family = ifp->if_family;
6151 ev_data.if_unit = (u_int32_t) ifp->if_unit;
6152 ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6153 ev_msg.dv[0].data_ptr = &ev_data;
6154 ev_msg.dv[1].data_length = 0;
6155 dlil_post_complete_msg(ifp, &ev_msg);
6156 }
6157
6158 #define TMP_IF_PROTO_ARR_SIZE 10
/*
 * Deliver an event to everyone interested in this interface, in order:
 * (1) each attached interface filter's filt_event callback, (2) each
 * attached protocol's event callback, (3) the interface's own if_event
 * callback — and finally post the event as a kernel event message via
 * dlil_post_complete_msg() (bumping the interface generation when
 * update_generation is set).
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto **tmp_ifproto_arr = NULL;
	/* small inline array to avoid allocation in the common case */
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	int tmp_ifproto_arr_idx = 0;
	bool tmp_malloc = false;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock while calling out to the filter */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	/* first pass: count the attached protocols */
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			/* too many protocols for the stack array; allocate */
			MALLOC(tmp_ifproto_arr, struct if_proto **,
			    sizeof(*tmp_ifproto_arr) * if_proto_count,
			    M_TEMP, M_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
			tmp_malloc = true;
		}

		/*
		 * Snapshot every attached protocol, taking a refcount
		 * on each so the entries survive dropping the ifnet lock.
		 */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* deliver the event to each protocol, lock-free */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		/* the event callback lives in the v1 or v2 KPI union */
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		/* release the snapshot reference taken above */
		if_proto_free(proto);
	}

cleanup:
	if (tmp_malloc) {
		FREE(tmp_ifproto_arr, M_TEMP);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6261
6262 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6263 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6264 {
6265 struct kev_msg kev_msg;
6266 int result = 0;
6267
6268 if (ifp == NULL || event == NULL) {
6269 return EINVAL;
6270 }
6271
6272 bzero(&kev_msg, sizeof(kev_msg));
6273 kev_msg.vendor_code = event->vendor_code;
6274 kev_msg.kev_class = event->kev_class;
6275 kev_msg.kev_subclass = event->kev_subclass;
6276 kev_msg.event_code = event->event_code;
6277 kev_msg.dv[0].data_ptr = &event->event_data[0];
6278 kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6279 kev_msg.dv[1].data_length = 0;
6280
6281 result = dlil_event_internal(ifp, &kev_msg, TRUE);
6282
6283 return result;
6284 }
6285
6286 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6287 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6288 {
6289 mbuf_t n = m;
6290 int chainlen = 0;
6291
6292 while (n != NULL) {
6293 chainlen++;
6294 n = n->m_next;
6295 }
6296 switch (chainlen) {
6297 case 0:
6298 break;
6299 case 1:
6300 atomic_add_64(&cls->cls_one, 1);
6301 break;
6302 case 2:
6303 atomic_add_64(&cls->cls_two, 1);
6304 break;
6305 case 3:
6306 atomic_add_64(&cls->cls_three, 1);
6307 break;
6308 case 4:
6309 atomic_add_64(&cls->cls_four, 1);
6310 break;
6311 case 5:
6312 default:
6313 atomic_add_64(&cls->cls_five_or_more, 1);
6314 break;
6315 }
6316 }
6317
6318 #if CONFIG_DTRACE
/*
 * Fire the DTrace IP "send" probe for an outbound packet.  Only
 * PF_INET and PF_INET6 packets are traced; other families are ignored.
 * The DTRACE_IP6 macro takes both an ip and an ip6_hdr slot, so the
 * slot for the other address family is passed as NULL.  Kept
 * noinline so it stays a distinct frame for the probe.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
6335 #endif /* CONFIG_DTRACE */
6336
6337 /*
6338 * dlil_output
6339 *
6340 * Caller should have a lock on the protocol domain if the protocol
6341 * doesn't support finer grained locking. In most cases, the lock
6342 * will be held from the socket layer and won't be released until
6343 * we return back to the socket layer.
6344 *
6345 * This does mean that we must take a protocol lock before we take
6346 * an interface lock if we're going to take both. This makes sense
6347 * because a protocol is likely to interact with an ifp while it
6348 * is under the protocol lock.
6349 *
6350 * An advisory code will be returned if adv is not null. This
6351 * can be used to provide feedback about interface queues to the
6352 * application.
6353 */
6354 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int raw,struct flowadv * adv)6355 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6356 void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
6357 {
6358 char *frame_type = NULL;
6359 char *dst_linkaddr = NULL;
6360 int retval = 0;
6361 char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
6362 char dst_linkaddr_buffer[MAX_LINKADDR * 4];
6363 struct if_proto *proto = NULL;
6364 mbuf_t m = NULL;
6365 mbuf_t send_head = NULL;
6366 mbuf_t *send_tail = &send_head;
6367 int iorefcnt = 0;
6368 u_int32_t pre = 0, post = 0;
6369 u_int32_t fpkts = 0, fbytes = 0;
6370 int32_t flen = 0;
6371 struct timespec now;
6372 u_int64_t now_nsec;
6373 boolean_t did_clat46 = FALSE;
6374 protocol_family_t old_proto_family = proto_family;
6375 struct sockaddr_in6 dest6;
6376 struct rtentry *rt = NULL;
6377 u_int32_t m_loop_set = 0;
6378
6379 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6380
6381 /*
6382 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6383 * from happening while this operation is in progress
6384 */
6385 if (!ifnet_datamov_begin(ifp)) {
6386 retval = ENXIO;
6387 goto cleanup;
6388 }
6389 iorefcnt = 1;
6390
6391 VERIFY(ifp->if_output_dlil != NULL);
6392
6393 /* update the driver's multicast filter, if needed */
6394 if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6395 ifp->if_updatemcasts = 0;
6396 }
6397
6398 frame_type = frame_type_buffer;
6399 dst_linkaddr = dst_linkaddr_buffer;
6400
6401 if (raw == 0) {
6402 ifnet_lock_shared(ifp);
6403 /* callee holds a proto refcnt upon success */
6404 proto = find_attached_proto(ifp, proto_family);
6405 if (proto == NULL) {
6406 ifnet_lock_done(ifp);
6407 retval = ENXIO;
6408 goto cleanup;
6409 }
6410 ifnet_lock_done(ifp);
6411 }
6412
6413 preout_again:
6414 if (packetlist == NULL) {
6415 goto cleanup;
6416 }
6417
6418 m = packetlist;
6419 packetlist = packetlist->m_nextpkt;
6420 m->m_nextpkt = NULL;
6421
6422 m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6423
6424 /*
6425 * Perform address family translation for the first
6426 * packet outside the loop in order to perform address
6427 * lookup for the translated proto family.
6428 */
6429 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6430 (ifp->if_type == IFT_CELLULAR ||
6431 dlil_is_clat_needed(proto_family, m))) {
6432 retval = dlil_clat46(ifp, &proto_family, &m);
6433 /*
6434 * Go to the next packet if translation fails
6435 */
6436 if (retval != 0) {
6437 m_freem(m);
6438 m = NULL;
6439 ip6stat.ip6s_clat464_out_drop++;
6440 /* Make sure that the proto family is PF_INET */
6441 ASSERT(proto_family == PF_INET);
6442 goto preout_again;
6443 }
6444 /*
6445 * Free the old one and make it point to the IPv6 proto structure.
6446 *
6447 * Change proto for the first time we have successfully
6448 * performed address family translation.
6449 */
6450 if (!did_clat46 && proto_family == PF_INET6) {
6451 did_clat46 = TRUE;
6452
6453 if (proto != NULL) {
6454 if_proto_free(proto);
6455 }
6456 ifnet_lock_shared(ifp);
6457 /* callee holds a proto refcnt upon success */
6458 proto = find_attached_proto(ifp, proto_family);
6459 if (proto == NULL) {
6460 ifnet_lock_done(ifp);
6461 retval = ENXIO;
6462 m_freem(m);
6463 m = NULL;
6464 goto cleanup;
6465 }
6466 ifnet_lock_done(ifp);
6467 if (ifp->if_type == IFT_ETHER) {
6468 /* Update the dest to translated v6 address */
6469 dest6.sin6_len = sizeof(struct sockaddr_in6);
6470 dest6.sin6_family = AF_INET6;
6471 dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
6472 dest = (const struct sockaddr *)&dest6;
6473
6474 /*
6475 * Lookup route to the translated destination
6476 * Free this route ref during cleanup
6477 */
6478 rt = rtalloc1_scoped((struct sockaddr *)&dest6,
6479 0, 0, ifp->if_index);
6480
6481 route = rt;
6482 }
6483 }
6484 }
6485
6486 /*
6487 * This path gets packet chain going to the same destination.
6488 * The pre output routine is used to either trigger resolution of
6489 * the next hop or retreive the next hop's link layer addressing.
6490 * For ex: ether_inet(6)_pre_output routine.
6491 *
6492 * If the routine returns EJUSTRETURN, it implies that packet has
6493 * been queued, and therefore we have to call preout_again for the
6494 * following packet in the chain.
6495 *
6496 * For errors other than EJUSTRETURN, the current packet is freed
6497 * and the rest of the chain (pointed by packetlist is freed as
6498 * part of clean up.
6499 *
6500 * Else if there is no error the retrieved information is used for
6501 * all the packets in the chain.
6502 */
6503 if (raw == 0) {
6504 proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
6505 proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
6506 retval = 0;
6507 if (preoutp != NULL) {
6508 retval = preoutp(ifp, proto_family, &m, dest, route,
6509 frame_type, dst_linkaddr);
6510
6511 if (retval != 0) {
6512 if (retval == EJUSTRETURN) {
6513 goto preout_again;
6514 }
6515 m_freem(m);
6516 m = NULL;
6517 goto cleanup;
6518 }
6519 }
6520 }
6521
6522 do {
6523 /*
6524 * pkt_hdr is set here to point to m_data prior to
6525 * calling into the framer. This value of pkt_hdr is
6526 * used by the netif gso logic to retrieve the ip header
6527 * for the TCP packets, offloaded for TSO processing.
6528 */
6529 if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
6530 uint8_t vlan_encap_len = 0;
6531
6532 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
6533 vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
6534 }
6535 m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
6536 } else {
6537 m->m_pkthdr.pkt_hdr = mtod(m, void *);
6538 }
6539
6540 /*
6541 * Perform address family translation if needed.
6542 * For now we only support stateless 4 to 6 translation
6543 * on the out path.
6544 *
6545 * The routine below translates IP header, updates protocol
6546 * checksum and also translates ICMP.
6547 *
6548 * We skip the first packet as it is already translated and
6549 * the proto family is set to PF_INET6.
6550 */
6551 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6552 (ifp->if_type == IFT_CELLULAR ||
6553 dlil_is_clat_needed(proto_family, m))) {
6554 retval = dlil_clat46(ifp, &proto_family, &m);
6555 /* Goto the next packet if the translation fails */
6556 if (retval != 0) {
6557 m_freem(m);
6558 m = NULL;
6559 ip6stat.ip6s_clat464_out_drop++;
6560 goto next;
6561 }
6562 }
6563
6564 #if CONFIG_DTRACE
6565 if (!raw) {
6566 dlil_output_dtrace(ifp, proto_family, m);
6567 }
6568 #endif /* CONFIG_DTRACE */
6569
6570 if (raw == 0 && ifp->if_framer != NULL) {
6571 int rcvif_set = 0;
6572
6573 /*
6574 * If this is a broadcast packet that needs to be
6575 * looped back into the system, set the inbound ifp
6576 * to that of the outbound ifp. This will allow
6577 * us to determine that it is a legitimate packet
6578 * for the system. Only set the ifp if it's not
6579 * already set, just to be safe.
6580 */
6581 if ((m->m_flags & (M_BCAST | M_LOOP)) &&
6582 m->m_pkthdr.rcvif == NULL) {
6583 m->m_pkthdr.rcvif = ifp;
6584 rcvif_set = 1;
6585 }
6586 m_loop_set = m->m_flags & M_LOOP;
6587 retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
6588 frame_type, &pre, &post);
6589 if (retval != 0) {
6590 if (retval != EJUSTRETURN) {
6591 m_freem(m);
6592 }
6593 goto next;
6594 }
6595
6596 /*
6597 * For partial checksum offload, adjust the start
6598 * and stuff offsets based on the prepended header.
6599 */
6600 if ((m->m_pkthdr.csum_flags &
6601 (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6602 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6603 m->m_pkthdr.csum_tx_stuff += pre;
6604 m->m_pkthdr.csum_tx_start += pre;
6605 }
6606
6607 if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
6608 dlil_output_cksum_dbg(ifp, m, pre,
6609 proto_family);
6610 }
6611
6612 /*
6613 * Clear the ifp if it was set above, and to be
6614 * safe, only if it is still the same as the
6615 * outbound ifp we have in context. If it was
6616 * looped back, then a copy of it was sent to the
6617 * loopback interface with the rcvif set, and we
6618 * are clearing the one that will go down to the
6619 * layer below.
6620 */
6621 if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
6622 m->m_pkthdr.rcvif = NULL;
6623 }
6624 }
6625
6626 /*
6627 * Let interface filters (if any) do their thing ...
6628 */
6629 retval = dlil_interface_filters_output(ifp, &m, proto_family);
6630 if (retval != 0) {
6631 if (retval != EJUSTRETURN) {
6632 m_freem(m);
6633 }
6634 goto next;
6635 }
6636 /*
6637 * Strip away M_PROTO1 bit prior to sending packet
6638 * to the driver as this field may be used by the driver
6639 */
6640 m->m_flags &= ~M_PROTO1;
6641
6642 /*
6643 * If the underlying interface is not capable of handling a
6644 * packet whose data portion spans across physically disjoint
6645 * pages, we need to "normalize" the packet so that we pass
6646 * down a chain of mbufs where each mbuf points to a span that
6647 * resides in the system page boundary. If the packet does
6648 * not cross page(s), the following is a no-op.
6649 */
6650 if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
6651 if ((m = m_normalize(m)) == NULL) {
6652 goto next;
6653 }
6654 }
6655
6656 /*
6657 * If this is a TSO packet, make sure the interface still
6658 * advertise TSO capability.
6659 */
6660 if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
6661 retval = EMSGSIZE;
6662 m_freem(m);
6663 goto cleanup;
6664 }
6665
6666 ifp_inc_traffic_class_out(ifp, m);
6667
6668 #if SKYWALK
6669 /*
6670 * For native skywalk devices, packets will be passed to pktap
6671 * after GSO or after the mbuf to packet conversion.
6672 * This is done for IPv4/IPv6 packets only because there is no
6673 * space in the mbuf to pass down the proto family.
6674 */
6675 if (dlil_is_native_netif_nexus(ifp)) {
6676 if (raw || m->m_pkthdr.pkt_proto == 0) {
6677 pktap_output(ifp, proto_family, m, pre, post);
6678 m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
6679 }
6680 } else {
6681 pktap_output(ifp, proto_family, m, pre, post);
6682 }
6683 #else /* SKYWALK */
6684 pktap_output(ifp, proto_family, m, pre, post);
6685 #endif /* SKYWALK */
6686
6687 /*
6688 * Count the number of elements in the mbuf chain
6689 */
6690 if (tx_chain_len_count) {
6691 dlil_count_chain_len(m, &tx_chain_len_stats);
6692 }
6693
6694 /*
6695 * Record timestamp; ifnet_enqueue() will use this info
6696 * rather than redoing the work. An optimization could
6697 * involve doing this just once at the top, if there are
6698 * no interface filters attached, but that's probably
6699 * not a big deal.
6700 */
6701 nanouptime(&now);
6702 net_timernsec(&now, &now_nsec);
6703 (void) mbuf_set_timestamp(m, now_nsec, TRUE);
6704
6705 /*
6706 * Discard partial sum information if this packet originated
6707 * from another interface; the packet would already have the
6708 * final checksum and we shouldn't recompute it.
6709 */
6710 if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
6711 (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6712 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6713 m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
6714 m->m_pkthdr.csum_data = 0;
6715 }
6716
6717 /*
6718 * Finally, call the driver.
6719 */
6720 if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
6721 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6722 flen += (m_pktlen(m) - (pre + post));
6723 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6724 }
6725 *send_tail = m;
6726 send_tail = &m->m_nextpkt;
6727 } else {
6728 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6729 flen = (m_pktlen(m) - (pre + post));
6730 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6731 } else {
6732 flen = 0;
6733 }
6734 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
6735 0, 0, 0, 0, 0);
6736 retval = (*ifp->if_output_dlil)(ifp, m);
6737 if (retval == EQFULL || retval == EQSUSPENDED) {
6738 if (adv != NULL && adv->code == FADV_SUCCESS) {
6739 adv->code = (retval == EQFULL ?
6740 FADV_FLOW_CONTROLLED :
6741 FADV_SUSPENDED);
6742 }
6743 retval = 0;
6744 }
6745 if (retval == 0 && flen > 0) {
6746 fbytes += flen;
6747 fpkts++;
6748 }
6749 if (retval != 0 && dlil_verbose) {
6750 DLIL_PRINTF("%s: output error on %s retval = %d\n",
6751 __func__, if_name(ifp),
6752 retval);
6753 }
6754 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
6755 0, 0, 0, 0, 0);
6756 }
6757 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6758
6759 next:
6760 m = packetlist;
6761 if (m != NULL) {
6762 m->m_flags |= m_loop_set;
6763 packetlist = packetlist->m_nextpkt;
6764 m->m_nextpkt = NULL;
6765 }
6766 /* Reset the proto family to old proto family for CLAT */
6767 if (did_clat46) {
6768 proto_family = old_proto_family;
6769 }
6770 } while (m != NULL);
6771
6772 if (send_head != NULL) {
6773 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
6774 0, 0, 0, 0, 0);
6775 if (ifp->if_eflags & IFEF_SENDLIST) {
6776 retval = (*ifp->if_output_dlil)(ifp, send_head);
6777 if (retval == EQFULL || retval == EQSUSPENDED) {
6778 if (adv != NULL) {
6779 adv->code = (retval == EQFULL ?
6780 FADV_FLOW_CONTROLLED :
6781 FADV_SUSPENDED);
6782 }
6783 retval = 0;
6784 }
6785 if (retval == 0 && flen > 0) {
6786 fbytes += flen;
6787 fpkts++;
6788 }
6789 if (retval != 0 && dlil_verbose) {
6790 DLIL_PRINTF("%s: output error on %s retval = %d\n",
6791 __func__, if_name(ifp), retval);
6792 }
6793 } else {
6794 struct mbuf *send_m;
6795 int enq_cnt = 0;
6796 VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
6797 while (send_head != NULL) {
6798 send_m = send_head;
6799 send_head = send_m->m_nextpkt;
6800 send_m->m_nextpkt = NULL;
6801 retval = (*ifp->if_output_dlil)(ifp, send_m);
6802 if (retval == EQFULL || retval == EQSUSPENDED) {
6803 if (adv != NULL) {
6804 adv->code = (retval == EQFULL ?
6805 FADV_FLOW_CONTROLLED :
6806 FADV_SUSPENDED);
6807 }
6808 retval = 0;
6809 }
6810 if (retval == 0) {
6811 enq_cnt++;
6812 if (flen > 0) {
6813 fpkts++;
6814 }
6815 }
6816 if (retval != 0 && dlil_verbose) {
6817 DLIL_PRINTF("%s: output error on %s "
6818 "retval = %d\n",
6819 __func__, if_name(ifp), retval);
6820 }
6821 }
6822 if (enq_cnt > 0) {
6823 fbytes += flen;
6824 ifnet_start(ifp);
6825 }
6826 }
6827 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6828 }
6829
6830 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6831
6832 cleanup:
6833 if (fbytes > 0) {
6834 ifp->if_fbytes += fbytes;
6835 }
6836 if (fpkts > 0) {
6837 ifp->if_fpackets += fpkts;
6838 }
6839 if (proto != NULL) {
6840 if_proto_free(proto);
6841 }
6842 if (packetlist) { /* if any packets are left, clean up */
6843 mbuf_freem_list(packetlist);
6844 }
6845 if (retval == EJUSTRETURN) {
6846 retval = 0;
6847 }
6848 if (iorefcnt == 1) {
6849 ifnet_datamov_end(ifp);
6850 }
6851 if (rt != NULL) {
6852 rtfree(rt);
6853 rt = NULL;
6854 }
6855
6856 return retval;
6857 }
6858
6859 /*
6860 * This routine checks if the destination address is not a loopback, link-local,
6861 * multicast or broadcast address.
6862 */
6863 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)6864 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
6865 {
6866 int ret = 0;
6867 switch (proto_family) {
6868 case PF_INET: {
6869 struct ip *iph = mtod(m, struct ip *);
6870 if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
6871 ret = 1;
6872 }
6873 break;
6874 }
6875 case PF_INET6: {
6876 struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
6877 if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
6878 CLAT64_NEEDED(&ip6h->ip6_dst)) {
6879 ret = 1;
6880 }
6881 break;
6882 }
6883 }
6884
6885 return ret;
6886 }
/*
 * @brief This routine translates an outbound IPv4 packet to IPv6 (CLAT46),
 * updates the protocol checksum, and for ICMP also translates the
 * type/code along with the inner (embedded) header.  For fragmented
 * packets an IPv6 fragment header is appended after protocol translation.
 *
 * @param ifp Pointer to the interface
 * @param proto_family pointer to protocol family. It is updated to
 * PF_INET6 only if the function performs the translation successfully.
 * @param m Pointer to the pointer pointing to the packet. Needed because this
 * routine can end up changing the mbuf to a different one.
 *
 * @return 0 on success or else a negative value.
 */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;              /* original IPv4 src/dst */
	uint8_t proto = 0;                      /* original IP protocol */
	struct in6_ifaddr *ia6_clat_src = NULL;
	struct in6_addr *src = NULL;            /* local CLAT46 IPv6 source */
	struct in6_addr dst;                    /* synthesized IPv6 destination */
	int error = 0;
	uint16_t off = 0;                       /* IPv4 header length, bytes */
	uint16_t tot_len = 0;
	uint16_t ip_id_val = 0;
	uint16_t ip_frag_off = 0;

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/* Wrap the mbuf in a pbuf so the nat464 routines can operate on it */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* Stash header fields before translation rewrites the header */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);
	/*
	 * NOTE(review): ip_id is captured in wire byte order and passed
	 * as-is to nat464_insert_frag46() — confirm that is what it expects.
	 */
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
	 * translation; drop the packet if none is configured.
	 */
	ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}

	src = &ia6_clat_src->ia_addr.sin6_addr;

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL; /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	if (ia6_clat_src != NULL) {
		IFA_REMREF(&ia6_clat_src->ia_ifa);
	}

	/*
	 * Hand the (possibly different) mbuf back to the caller; if the
	 * pbuf is no longer valid the packet was consumed/destroyed.
	 */
	if (pbuf_is_valid(pbuf)) {
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		error = -1;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	if (error == 0) {
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7032
/*
 * @brief This routine translates an incoming IPv6 packet to IPv4 (CLAT64),
 * updates the protocol checksum and also translates ICMPv6 outer
 * and inner headers.  Only packets addressed to the interface's reserved
 * CLAT46 IPv6 address are translated; everything else passes through.
 *
 * @return 0 on success or else a negative value.
 */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;             /* original IPv6 src/dst */
	uint8_t proto = 0;                      /* next-header / protocol */
	struct in6_ifaddr *ia6_clat_dst = NULL;
	struct in_ifaddr *ia4_clat_dst = NULL;
	struct in_addr *dst = NULL;             /* local CLAT46 IPv4 address */
	struct in_addr src;                     /* IPv4 src derived from osrc */
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		/* Wrap the mbuf so the nat464 routines can operate on it */
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrieve the local CLAT46 IPv4 address reserved for
		 * stateless translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			IFA_REMREF(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		IFA_REMREF(&ia6_clat_dst->ia_ifa);

		/*
		 * The packet's new IPv4 destination is the reserved local
		 * address; the IPv4 source is synthesized from the IPv6
		 * source by removing the NAT64 prefix.
		 */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		/* Traffic class occupies bits 20-27 of the flow word */
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		if (ia4_clat_dst != NULL) {
			IFA_REMREF(&ia4_clat_dst->ia_ifa);
		}

		/* Hand the (possibly different) mbuf back to the caller */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		if (error == 0) {
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7174
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(void *);

/* Argument handed to ifnet_ioctl_event_callback() */
struct ifnet_ioctl_event {
	struct ifnet *ifp;      /* interface; callers hold an io refcnt on it */
	u_long ioctl_code;      /* ioctl to issue against ifp */
};

/* Work-queue entry embedding the callback argument */
struct ifnet_ioctl_event_nwk_wq_entry {
	/* must be first: the entry is enqueued via a nwk_wq_entry cast */
	struct nwk_wq_entry nwk_wqe;
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7187
/*
 * Schedule an ioctl against an interface to run asynchronously on the
 * network work queue.  Takes an io reference on the interface that is
 * dropped by ifnet_ioctl_event_callback() once the ioctl has run.
 */
void
ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
{
	struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;

	/*
	 * Get an io ref count if the interface is attached.
	 * At this point it most likely is. We are taking a reference for
	 * deferred processing.
	 */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
		    "is not attached",
		    __func__, __LINE__, if_name(ifp), ioctl_code);
		return;
	}

	/*
	 * NOTE(review): result is not NULL-checked — assumes an
	 * M_WAITOK allocation blocks rather than fails; confirm.
	 */
	MALLOC(p_ifnet_ioctl_ev, struct ifnet_ioctl_event_nwk_wq_entry *,
	    sizeof(struct ifnet_ioctl_event_nwk_wq_entry),
	    M_NWKWQ, M_WAITOK | M_ZERO);

	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
	p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;

	p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
	/* presumably tells the work queue to free the entry — verify in nwk_wq */
	p_ifnet_ioctl_ev->nwk_wqe.is_arg_managed = TRUE;
	p_ifnet_ioctl_ev->nwk_wqe.arg = &p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg;
	nwk_wq_enqueue((struct nwk_wq_entry*)p_ifnet_ioctl_ev);
}
7217
7218 static void
ifnet_ioctl_event_callback(void * arg)7219 ifnet_ioctl_event_callback(void *arg)
7220 {
7221 struct ifnet_ioctl_event *p_ifnet_ioctl_ev = (struct ifnet_ioctl_event *)arg;
7222 struct ifnet *ifp = p_ifnet_ioctl_ev->ifp;
7223 u_long ioctl_code = p_ifnet_ioctl_ev->ioctl_code;
7224 int ret = 0;
7225
7226 if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7227 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7228 __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7229 } else if (dlil_verbose) {
7230 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7231 "for ioctl %lu",
7232 __func__, __LINE__, if_name(ifp), ioctl_code);
7233 }
7234 ifnet_decr_iorefcnt(ifp);
7235 return;
7236 }
7237
/*
 * Dispatch an ioctl against an interface: first the attached interface
 * filters, then the protocol handler (when proto_fam is non-zero), and
 * finally the driver's if_ioctl.  The first non-0/EOPNOTSUPP result
 * wins and stops dispatch; EJUSTRETURN also stops dispatch and is
 * mapped to 0 on return.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;        /* means "nobody handled it yet" */
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock while calling out to the filter */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	/* release the io reference taken above */
	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7355
7356 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7357 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7358 {
7359 errno_t error = 0;
7360
7361
7362 if (ifp->if_set_bpf_tap) {
7363 /* Get an io reference on the interface if it is attached */
7364 if (!ifnet_is_attached(ifp, 1)) {
7365 return ENXIO;
7366 }
7367 error = ifp->if_set_bpf_tap(ifp, mode, callback);
7368 ifnet_decr_iorefcnt(ifp);
7369 }
7370 return error;
7371 }
7372
/*
 * Resolve a protocol multicast address into a link-layer address.
 * The attached protocol is asked first; the driver may then verify the
 * resulting link-layer address (or, if the protocol did not resolve it,
 * the original protocol address) via if_check_multi.
 */
errno_t
dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
    struct sockaddr *ll_addr, size_t ll_len)
{
	errno_t result = EOPNOTSUPP;
	struct if_proto *proto;
	const struct sockaddr *verify;
	proto_media_resolve_multi resolvep;

	/* hold an io reference for the duration of the call */
	if (!ifnet_is_attached(ifp, 1)) {
		return result;
	}

	bzero(ll_addr, ll_len);

	/* Call the protocol first; callee holds a proto refcnt upon success */
	ifnet_lock_shared(ifp);
	proto = find_attached_proto(ifp, proto_addr->sa_family);
	ifnet_lock_done(ifp);
	if (proto != NULL) {
		resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
		if (resolvep != NULL) {
			result = resolvep(ifp, proto_addr,
			    (struct sockaddr_dl *)(void *)ll_addr, ll_len);
		}
		if_proto_free(proto);
	}

	/* Let the interface verify the multicast address */
	if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
		if (result == 0) {
			verify = ll_addr;
		} else {
			verify = proto_addr;
		}
		result = ifp->if_check_multi(ifp, verify);
	}

	ifnet_decr_iorefcnt(ifp);
	return result;
}
7415
7416 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7417 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7418 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7419 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7420 {
7421 struct if_proto *proto;
7422 errno_t result = 0;
7423
7424 /* callee holds a proto refcnt upon success */
7425 ifnet_lock_shared(ifp);
7426 proto = find_attached_proto(ifp, target_proto->sa_family);
7427 ifnet_lock_done(ifp);
7428 if (proto == NULL) {
7429 result = ENOTSUP;
7430 } else {
7431 proto_media_send_arp arpp;
7432 arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7433 proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7434 if (arpp == NULL) {
7435 result = ENOTSUP;
7436 } else {
7437 switch (arpop) {
7438 case ARPOP_REQUEST:
7439 arpstat.txrequests++;
7440 if (target_hw != NULL) {
7441 arpstat.txurequests++;
7442 }
7443 break;
7444 case ARPOP_REPLY:
7445 arpstat.txreplies++;
7446 break;
7447 }
7448 result = arpp(ifp, arpop, sender_hw, sender_proto,
7449 target_hw, target_proto);
7450 }
7451 if_proto_free(proto);
7452 }
7453
7454 return result;
7455 }
7456
/*
 * Opaque, empty marker type: net_thread_marks_t tokens are synthesized
 * as small byte offsets from the address of net_thread_marks_base and
 * are never dereferenced (see net_thread_marks_push/pop below).
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* Token meaning "no marks were changed" (offset zero from base) */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7462
/*
 * Set (push) the given mark bits on the current thread.  The returned
 * token encodes — as a byte offset from net_thread_marks_base — which
 * bits this call actually set, so net_thread_marks_pop() clears only
 * those bits and leaves pre-existing marks intact.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* bits requested that are not already set on this thread */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	/* token is base + pop; pop is recovered by pointer subtraction */
	return (net_thread_marks_t)&base[pop];
}
7480
/*
 * Clear (push) the given mark bits on the current thread.  The returned
 * token encodes which bits were actually cleared, so that
 * net_thread_unmarks_pop() restores exactly those bits.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* bits requested that are currently set on this thread */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	/* token is base + unpop; unpop is recovered by pointer subtraction */
	return (net_thread_marks_t)&base[unpop];
}
7498
/*
 * Undo a net_thread_marks_push(): recover the set bits from the token's
 * offset against net_thread_marks_base and clear them on the current
 * thread.  The marks must still be set (VERIFYed).
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* token must fit in 32 bits and match currently-set marks */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7514
/*
 * Undo a net_thread_unmarks_push(): recover the cleared bits from the
 * token's offset and set them again on the current thread.  The marks
 * must currently be clear (VERIFYed).
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* token must fit in 32 bits and the bits must be clear */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= unpop;
	}
}
7530
7531 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7532 net_thread_is_marked(u_int32_t check)
7533 {
7534 if (check != 0) {
7535 struct uthread *uth = current_uthread();
7536 return uth->uu_network_marks & check;
7537 } else {
7538 return 0;
7539 }
7540 }
7541
7542 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7543 net_thread_is_unmarked(u_int32_t check)
7544 {
7545 if (check != 0) {
7546 struct uthread *uth = current_uthread();
7547 return ~uth->uu_network_marks & check;
7548 } else {
7549 return 0;
7550 }
7551 }
7552
/*
 * An ARP announcement (gratuitous ARP) carries the same IPv4 address as
 * both sender and target.  Returns nonzero iff both addresses are
 * present and equal.
 */
static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,
    const struct sockaddr_in * target_sin)
{
	return sender_sin != NULL && target_sin != NULL &&
	    sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
}
7563
/*
 * Send an ARP packet.  Ordinarily delegates to dlil_send_arp_internal()
 * on the given interface; the special case is an ARP request for an
 * IPv4 link-local target (when ipv4_ll_arp_aware is enabled), which is
 * broadcast on every IFEF_ARPLL interface that has an IPv4 address —
 * except for announcements, which stay on the one interface.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = (void *)(uintptr_t)target_proto0;

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		/* copy into a sockaddr_inarp so SIN_ROUTER can be flagged */
		bcopy(target_proto, &target_proto_sinarp,
		    sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = (struct sockaddr *)&target_proto_sinarp;
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces. The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = (struct sockaddr_in *)(void *)(uintptr_t)sender_proto;
	target_sin = (struct sockaddr_in *)(void *)(uintptr_t)target_proto;
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t *ifp_list;
		u_int32_t count;
		u_int32_t ifp_on;

		/* stays ENOTSUP unless some interface accepts the send */
		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing. This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						source_ip_copy =
						    *(struct sockaddr_in *)
						    (void *)source_ip->ifa_addr;
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* keep the link-layer address alive after unlock */
				IFA_ADDREF(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, (struct sockaddr_dl *)(void *)
				    source_hw->ifa_addr,
				    (struct sockaddr *)&source_ip_copy, NULL,
				    target_proto);

				IFA_REMREF(source_hw);
				/* report the first non-ENOTSUP outcome */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free(ifp_list);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7678
7679 /*
7680 * Caller must hold ifnet head lock.
7681 */
7682 static int
ifnet_lookup(struct ifnet * ifp)7683 ifnet_lookup(struct ifnet *ifp)
7684 {
7685 struct ifnet *_ifp;
7686
7687 LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7688 TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7689 if (_ifp == ifp) {
7690 break;
7691 }
7692 }
7693 return _ifp != NULL;
7694 }
7695
7696 /*
7697 * Caller has to pass a non-zero refio argument to get a
7698 * IO reference count. This will prevent ifnet_detach from
7699 * being called when there are outstanding io reference counts.
7700 */
7701 int
ifnet_is_attached(struct ifnet * ifp,int refio)7702 ifnet_is_attached(struct ifnet *ifp, int refio)
7703 {
7704 int ret;
7705
7706 lck_mtx_lock_spin(&ifp->if_ref_lock);
7707 if ((ret = IF_FULLY_ATTACHED(ifp))) {
7708 if (refio > 0) {
7709 ifp->if_refio++;
7710 }
7711 }
7712 lck_mtx_unlock(&ifp->if_ref_lock);
7713
7714 return ret;
7715 }
7716
/*
 * Bump if_threads_pending under if_ref_lock.
 * NOTE(review): judging by the matching decrement/wakeup below, this
 * tracks worker threads created for ifp that have not started yet —
 * confirm against the thread-creation call sites.
 */
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7724
/*
 * Drop if_threads_pending; when it reaches zero, wake any waiter
 * sleeping on the counter's address.
 */
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7736
/*
 * Caller must ensure the interface is attached; the assumption is that
 * there is at least an outstanding IO reference count held already
 * (both conditions are VERIFYed below).
 * Most callers would call ifnet_is_attached() or ifnet_datamov_begin()
 * instead, which take the first reference safely.
 */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7751
/*
 * Drop one io reference with if_ref_lock already held.  Enforces that
 * no data-movement references outlive the last io reference, and wakes
 * the detach thread waiting on if_refio once the count reaches zero
 * while a detach is in progress.
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* datamov refs are a subset of io refs; they must drain first */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
7772
/* Locking wrapper around ifnet_decr_iorefcnt_locked(). */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7780
/*
 * Take both an io reference and a data-movement reference, but only if
 * the interface is fully attached and ready (not suspended).  Returns
 * TRUE on success; on FALSE no references are held.  Paired with
 * ifnet_datamov_end(), which drops both.
 */
boolean_t
ifnet_datamov_begin(struct ifnet *ifp)
{
	boolean_t ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
		ifp->if_refio++;
		ifp->if_datamov++;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
7795
/*
 * Release the data-movement and io references taken by
 * ifnet_datamov_begin().
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		wakeup(&(ifp->if_datamov));
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7811
/*
 * Suspend data movement on the interface: takes an io reference (held
 * until the matching ifnet_datamov_resume()) and, on the first
 * suspension, clears IFRF_READY so ifnet_datamov_begin() starts
 * failing.  Suspensions nest via if_suspend.
 */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifp->if_refio++;
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7824
/*
 * Block until all in-flight data movement on the interface has ended,
 * then purge the interface send queues.  The caller must already have
 * suspended data movement via ifnet_datamov_suspend() (VERIFYed).
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* sleep until ifnet_datamov_end() wakes us with if_datamov == 0 */
	while (ifp->if_datamov != 0) {
#if SKYWALK
		SK_ERR("Waiting for data path(s) to quiesce on %s",
		    if_name(ifp));
#endif /* SKYWALK */
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
7852
/*
 * Resume data movement previously stopped by ifnet_datamov_suspend().
 * Suspensions nest; only when the last suspender resumes is
 * IFRF_READY restored, allowing ifnet_datamov_begin() to succeed
 * again.  Releases the I/O reference taken by the matching suspend.
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7866
7867 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)7868 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
7869 {
7870 struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
7871 ctrace_t *tr;
7872 u_int32_t idx;
7873 u_int16_t *cnt;
7874
7875 if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
7876 panic("%s: dl_if %p has no debug structure", __func__, dl_if);
7877 /* NOTREACHED */
7878 }
7879
7880 if (refhold) {
7881 cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
7882 tr = dl_if_dbg->dldbg_if_refhold;
7883 } else {
7884 cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
7885 tr = dl_if_dbg->dldbg_if_refrele;
7886 }
7887
7888 idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE;
7889 ctrace_record(&tr[idx]);
7890 }
7891
/*
 * Take a reference on the dlil_ifnet backing this ifnet.
 * Returns EINVAL on a NULL ifp; panics on refcount wraparound.
 * When refcount tracing is enabled (dl_if_trace set), records the
 * hold via the trace callback.  Balanced by dlil_if_free().
 */
errno_t
dlil_if_ref(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	++dl_if->dl_if_refcnt;
	/* increment-then-check: zero here means the counter wrapped */
	if (dl_if->dl_if_refcnt == 0) {
		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
	}
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, TRUE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);

	return 0;
}
7914
7915 errno_t
dlil_if_free(struct ifnet * ifp)7916 dlil_if_free(struct ifnet *ifp)
7917 {
7918 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
7919 bool need_release = FALSE;
7920
7921 if (dl_if == NULL) {
7922 return EINVAL;
7923 }
7924
7925 lck_mtx_lock_spin(&dl_if->dl_if_lock);
7926 switch (dl_if->dl_if_refcnt) {
7927 case 0:
7928 panic("%s: negative refcnt for ifp=%p", __func__, ifp);
7929 /* NOTREACHED */
7930 break;
7931 case 1:
7932 if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
7933 need_release = TRUE;
7934 }
7935 break;
7936 default:
7937 break;
7938 }
7939 --dl_if->dl_if_refcnt;
7940 if (dl_if->dl_if_trace != NULL) {
7941 (*dl_if->dl_if_trace)(dl_if, FALSE);
7942 }
7943 lck_mtx_unlock(&dl_if->dl_if_lock);
7944 if (need_release) {
7945 _dlil_if_release(ifp, true);
7946 }
7947 return 0;
7948 }
7949
/*
 * Attach a fully-initialized if_proto to its interface.
 *
 * The caller passes in one reference on `proto'.  On success the
 * protocol is inserted at the tail of its hash chain and an additional
 * reference is held for the attachment itself; `*proto_count' (if
 * non-NULL) receives the number of protocols now attached.  Errors:
 * EINVAL for non-PF_BRIDGE on vmnet, ENXIO if the interface is no
 * longer attached, EEXIST if the family is already attached, or
 * whatever the family module's if_add_proto callback returns.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* takes an I/O refcnt on success; dropped at ioref_done */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		/* family already attached; drop the lookup reference */
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* walk to the tail of the chain so insertion preserves order */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	/* notify interested parties (e.g. configd) of the new protocol */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data));
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8029
8030 static void
dlil_handle_proto_attach(ifnet_t ifp,protocol_family_t protocol)8031 dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
8032 {
8033 /*
8034 * A protocol has been attached, mark the interface up.
8035 * This used to be done by configd.KernelEventMonitor, but that
8036 * is inherently prone to races (rdar://problem/30810208).
8037 */
8038 (void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
8039 (void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
8040 dlil_post_sifflags_msg(ifp);
8041 #if SKYWALK
8042 switch (protocol) {
8043 case AF_INET:
8044 case AF_INET6:
8045 /* don't attach the flowswitch unless attaching IP */
8046 dlil_attach_flowswitch_nexus(ifp);
8047 break;
8048 default:
8049 break;
8050 }
8051 #endif /* SKYWALK */
8052 }
8053
/*
 * Exported KPI: attach a v1 protocol handler to an interface.
 *
 * Allocates an if_proto, copies the v1 callbacks out of
 * `proto_details', and hands ownership to dlil_attach_protocol().
 * On failure the if_proto is freed here; on success the interface is
 * marked up via dlil_handle_proto_attach().  Returns EINVAL on bad
 * arguments, ENXIO if the interface is not in the global list, EEXIST
 * if the family is already attached, or the attach error.
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	/* ownership of ifproto passes to the callee on success */
	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8115
/*
 * Exported KPI: attach a v2 protocol handler to an interface.
 *
 * Identical to ifnet_attach_protocol() except that the if_proto is
 * populated from the v2 parameter structure (kProtoKPI_v2 callbacks);
 * the two functions should be kept in sync.
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	/* ownership of ifproto passes to the callee on success */
	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8177
/*
 * Exported KPI: detach a protocol handler from an interface.
 *
 * Removes the if_proto from the hash, notifies the family module via
 * if_del_proto, and swaps every KPI callback for the ifproto_media_*
 * stubs so that late callers racing with detach get ENXIO (or a
 * no-op for the event callback) instead of calling into freed state.
 * Two if_proto_free() calls follow: one drops the reference held
 * since attach, the other drops the reference taken by the
 * find_attached_proto() lookup above; final teardown happens when the
 * last reference goes away.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* defuse the callbacks for any caller still holding a reference */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8243
8244
/*
 * Stub v1 input handler installed by ifnet_detach_protocol();
 * rejects any packet delivered after detach with ENXIO.
 */
static errno_t
ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet, char *header)
{
#pragma unused(ifp, protocol, packet, header)
	return ENXIO;
}
8252
/*
 * Stub v2 input handler installed by ifnet_detach_protocol();
 * rejects any packet delivered after detach with ENXIO.
 */
static errno_t
ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet)
{
#pragma unused(ifp, protocol, packet)
	return ENXIO;
}
8260
/*
 * Stub pre-output handler installed by ifnet_detach_protocol();
 * fails any output attempt after detach with ENXIO.
 */
static errno_t
ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
    char *link_layer_dest)
{
#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
	return ENXIO;
}
8269
/*
 * Stub event handler installed by ifnet_detach_protocol();
 * silently discards events arriving after detach.
 */
static void
ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
    const struct kev_msg *event)
{
#pragma unused(ifp, protocol, event)
}
8276
/*
 * Stub ioctl handler installed by ifnet_detach_protocol();
 * fails any ioctl issued after detach with ENXIO.
 */
static errno_t
ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
    unsigned long command, void *argument)
{
#pragma unused(ifp, protocol, command, argument)
	return ENXIO;
}
8284
/*
 * Stub multicast-resolve handler installed by ifnet_detach_protocol();
 * fails resolution attempts after detach with ENXIO.
 */
static errno_t
ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
    struct sockaddr_dl *out_ll, size_t ll_len)
{
#pragma unused(ifp, proto_addr, out_ll, ll_len)
	return ENXIO;
}
8292
/*
 * Stub ARP-send handler installed by ifnet_detach_protocol();
 * fails ARP requests issued after detach with ENXIO.
 */
static errno_t
ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
	return ENXIO;
}
8301
8302 extern int if_next_index(void);
8303 extern int tcp_ecn_outbound;
8304
8305 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8306 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8307 {
8308 uint32_t sflags = 0;
8309 int err;
8310
8311 if (if_flowadv) {
8312 sflags |= PKTSCHEDF_QALG_FLOWCTL;
8313 }
8314
8315 if (if_delaybased_queue) {
8316 sflags |= PKTSCHEDF_QALG_DELAYBASED;
8317 }
8318
8319 if (ifp->if_output_sched_model ==
8320 IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8321 sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8322 }
8323 /* Inherit drop limit from the default queue */
8324 if (ifp->if_snd != ifcq) {
8325 IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8326 }
8327 /* Initialize transmit queue(s) */
8328 err = ifclassq_setup(ifcq, ifp, sflags);
8329 if (err != 0) {
8330 panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8331 "err=%d", __func__, ifp, err);
8332 /* NOTREACHED */
8333 }
8334 }
8335
/*
 * Attach an initialized ifnet to the system.
 *
 * Serialized under dlil_if_lock: verifies the interface is embryonic
 * and not already on ifnet_head, assigns an interface index and the
 * first/permanent link address, resets statistics and state, sets up
 * the transmit classq, spawns the per-interface input/start/poll
 * kernel threads as the driver model requires, attaches IGMP/MLD
 * state, waits for all spawned threads to run once, and finally marks
 * the interface IFRF_ATTACHED | IFRF_READY before posting
 * KEV_DL_IF_ATTACHED.
 *
 * Returns 0 on success; EINVAL for a NULL ifp or link-address length
 * mismatch, EEXIST if already attached, ENODEV without a family
 * module, ENOBUFS if no index or link address can be allocated.
 */
errno_t
ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
{
#if SKYWALK
	boolean_t netif_compat;
	if_nexus_netif nexus_netif;
#endif /* SKYWALK */
	struct ifnet *tmp_if;
	struct ifaddr *ifa;
	struct if_data_internal if_data_saved;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	struct dlil_threading_info *dl_inp;
	thread_continue_t thfunc = NULL;
	int err;

	if (ifp == NULL) {
		return EINVAL;
	}

	/*
	 * Serialize ifnet attach using dlil_ifnet_lock, in order to
	 * prevent the interface from being configured while it is
	 * embryonic, as ifnet_head_lock is dropped and reacquired
	 * below prior to marking the ifnet with IFRF_ATTACHED.
	 */
	dlil_if_lock();
	ifnet_head_lock_exclusive();
	/* Verify we aren't already on the list */
	TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
		if (tmp_if == ifp) {
			ifnet_head_done();
			dlil_if_unlock();
			return EEXIST;
		}
	}

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
		panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	ifnet_lock_exclusive(ifp);

	/* Sanity check */
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_threads_pending == 0);

	/* adopt the caller's link-layer address length, or reject a mismatch */
	if (ll_addr != NULL) {
		if (ifp->if_addrlen == 0) {
			ifp->if_addrlen = ll_addr->sdl_alen;
		} else if (ll_addr->sdl_alen != ifp->if_addrlen) {
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return EINVAL;
		}
	}

	/*
	 * Allow interfaces without protocol families to attach
	 * only if they have the necessary fields filled out.
	 */
	if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
		DLIL_PRINTF("%s: Attempt to attach interface without "
		    "family module - %d\n", __func__, ifp->if_family);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENODEV;
	}

	/* Allocate protocol hash table */
	VERIFY(ifp->if_proto_hash == NULL);
	ifp->if_proto_hash = zalloc_flags(dlif_phash_zone,
	    Z_WAITOK | Z_ZERO | Z_NOFAIL);

	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	TAILQ_INIT(&ifp->if_flt_head);
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* recycled interfaces (DLIF_REUSE) keep their multicast list */
	if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
		VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
		LIST_INIT(&ifp->if_multiaddrs);
	}

	VERIFY(ifp->if_allhostsinm == NULL);
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
	TAILQ_INIT(&ifp->if_addrhead);

	if (ifp->if_index == 0) {
		int idx = if_next_index();

		if (idx == -1) {
			ifp->if_index = 0;
			ifnet_lock_done(ifp);
			ifnet_head_done();
			dlil_if_unlock();
			return ENOBUFS;
		}
		ifp->if_index = (uint16_t)idx;

		/* the lladdr passed at attach time is the permanent address */
		if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
		    ll_addr->sdl_alen == ETHER_ADDR_LEN) {
			bcopy(CONST_LLADDR(ll_addr),
			    dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
			dl_if->dl_if_permanent_ether_is_set = 1;
		}
	}
	/* There should not be anything occupying this slot */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* allocate (if needed) and initialize a link address */
	ifa = dlil_alloc_lladdr(ifp, ll_addr);
	if (ifa == NULL) {
		ifnet_lock_done(ifp);
		ifnet_head_done();
		dlil_if_unlock();
		return ENOBUFS;
	}

	VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
	ifnet_addrs[ifp->if_index - 1] = ifa;

	/* make this address the first on the list */
	IFA_LOCK(ifa);
	/* hold a reference for ifnet_addrs[] */
	IFA_ADDREF_LOCKED(ifa);
	/* if_attach_link_ifa() holds a reference for ifa_link */
	if_attach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* publish the interface in the global lookup structures */
	TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
	ifindex2ifnet[ifp->if_index] = ifp;

	/* Hold a reference to the underlying dlil_ifnet */
	ifnet_reference(ifp);

	/* Clear stats (save and restore other fields that we care) */
	if_data_saved = ifp->if_data;
	bzero(&ifp->if_data, sizeof(ifp->if_data));
	ifp->if_data.ifi_type = if_data_saved.ifi_type;
	ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
	ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
	ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
	ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
	ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
	ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
	ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
	ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
	ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
	ifnet_touch_lastchange(ifp);

	VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
	    ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);

	dlil_ifclassq_setup(ifp, ifp->if_snd);

	/* Sanity checks on the input thread storage */
	dl_inp = &dl_if->dl_if_inpstorage;
	bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
	VERIFY(dl_inp->dlth_flags == 0);
	VERIFY(dl_inp->dlth_wtot == 0);
	VERIFY(dl_inp->dlth_ifp == NULL);
	VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
	VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
	VERIFY(!dl_inp->dlth_affinity);
	VERIFY(ifp->if_inp == NULL);
	VERIFY(dl_inp->dlth_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_strategy == NULL);
	VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(dl_inp->dlth_affinity_tag == 0);

#if IFNET_INPUT_SANITY_CHK
	VERIFY(dl_inp->dlth_pkts_cnt == 0);
#endif /* IFNET_INPUT_SANITY_CHK */

	VERIFY(ifp->if_poll_thread == THREAD_NULL);
	dlil_reset_rxpoll_params(ifp);
	/*
	 * A specific DLIL input thread is created per non-loopback interface.
	 */
	if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
		ifp->if_inp = dl_inp;
		ifnet_incr_pending_thread_count(ifp);
		err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
		/* ENODEV means no dedicated thread is needed; not fatal */
		if (err == ENODEV) {
			VERIFY(thfunc == NULL);
			ifnet_decr_pending_thread_count(ifp);
		} else if (err != 0) {
			panic_plain("%s: ifp=%p couldn't get an input thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
	}
	/*
	 * If the driver supports the new transmit model, calculate flow hash
	 * and create a workloop starter thread to invoke the if_start callback
	 * where the packets may be dequeued and transmitted.
	 */
	if (ifp->if_eflags & IFEF_TXSTART) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		ifp->if_flowhash = ifnet_calc_flowhash(ifp);
		VERIFY(ifp->if_flowhash != 0);
		VERIFY(ifp->if_start_thread == THREAD_NULL);

		ifnet_set_start_cycle(ifp, NULL);
		ifp->if_start_active = 0;
		ifp->if_start_req = 0;
		ifp->if_start_flags = 0;
		VERIFY(ifp->if_start != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_start_thread_func,
		    ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
			panic_plain("%s: "
			    "ifp=%p couldn't get a start thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* slightly boost the starter thread's scheduling precedence */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_start_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	} else {
		ifp->if_flowhash = 0;
	}

	/* Reset polling parameters */
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	VERIFY(ifp->if_poll_thread == THREAD_NULL);

	/*
	 * If the driver supports the new receive model, create a poller
	 * thread to invoke if_input_poll callback where the packets may
	 * be dequeued from the driver and processed for reception.
	 * if the interface is netif compat then the poller thread is
	 * managed by netif.
	 */
	if (thfunc == dlil_rxpoll_input_thread_func) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		VERIFY(ifp->if_input_poll != NULL);
		VERIFY(ifp->if_input_ctl != NULL);
		ifnet_incr_pending_thread_count(ifp);
		if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
		    &ifp->if_poll_thread)) != KERN_SUCCESS) {
			panic_plain("%s: ifp=%p couldn't get a poll thread; "
			    "err=%d", __func__, ifp, err);
			/* NOTREACHED */
		}
		/* slightly boost the poller thread's scheduling precedence */
		bzero(&info, sizeof(info));
		info.importance = 1;
		kret = thread_policy_set(ifp->if_poll_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
	}

	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	VERIFY(ifp->if_desc.ifd_len == 0);
	VERIFY(ifp->if_desc.ifd_desc != NULL);

	/* Record attach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);

	/* count link-layer memberships carried over from a recycled ifnet */
	ifp->if_updatemcasts = 0;
	if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
		struct ifmultiaddr *ifma;
		LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			IFMA_LOCK(ifma);
			if (ifma->ifma_addr->sa_family == AF_LINK ||
			    ifma->ifma_addr->sa_family == AF_UNSPEC) {
				ifp->if_updatemcasts++;
			}
			IFMA_UNLOCK(ifma);
		}

		DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
		    "membership(s)\n", if_name(ifp),
		    ifp->if_updatemcasts);
	}

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear foreground/realtime activity timestamps */
	ifp->if_fg_sendts = 0;
	ifp->if_rt_sendts = 0;

	/* Clear throughput estimates and radio type */
	ifp->if_estimated_up_bucket = 0;
	ifp->if_estimated_down_bucket = 0;
	ifp->if_radio_type = 0;
	ifp->if_radio_channel = 0;

	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	VERIFY(ifp->if_agentids == NULL);
	VERIFY(ifp->if_agentcount == 0);

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
	ifp->if_interface_state.interface_availability =
	    IF_INTERFACE_STATE_INTERFACE_AVAILABLE;

	/* Initialize Link Quality Metric (loopback [lo0] is always good) */
	if (ifp == lo_ifp) {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
	} else {
		ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
	}

	/*
	 * Enable ECN capability on this interface depending on the
	 * value of ECN global setting
	 */
	if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
		if_set_eflags(ifp, IFEF_ECN_ENABLE);
		if_clear_eflags(ifp, IFEF_ECN_DISABLE);
	}

	/*
	 * Built-in Cyclops always on policy for WiFi infra
	 */
	if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
		errno_t error;

		error = if_set_qosmarking_mode(ifp,
		    IFRTYPE_QOSMARKING_FASTLANE);
		if (error != 0) {
			DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
			    __func__, ifp->if_xname, error);
		} else {
			if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
#if (DEVELOPMENT || DEBUG)
			DLIL_PRINTF("%s fastlane enabled on %s\n",
			    __func__, ifp->if_xname);
#endif /* (DEVELOPMENT || DEBUG) */
		}
	}

	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
#endif /* SKYWALK */

	lck_mtx_lock(&ifp->if_cached_route_lock);
	/* Enable forwarding cached route */
	ifp->if_fwd_cacheok = 1;
	/* Clean up any existing cached routes */
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));

	/*
	 * Allocate and attach IGMPv3/MLDv2 interface specific variables
	 * and trees; do this before the ifnet is marked as attached.
	 * The ifnet keeps the reference to the info structures even after
	 * the ifnet is detached, since the network-layer records still
	 * refer to the info structures even after that. This also
	 * makes it possible for them to still function after the ifnet
	 * is recycled or reattached.
	 */
#if INET
	if (IGMP_IFINFO(ifp) == NULL) {
		IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
		VERIFY(IGMP_IFINFO(ifp) != NULL);
	} else {
		VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
		igmp_domifreattach(IGMP_IFINFO(ifp));
	}
#endif /* INET */
	if (MLD_IFINFO(ifp) == NULL) {
		MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
		VERIFY(MLD_IFINFO(ifp) != NULL);
	} else {
		VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
		mld_domifreattach(MLD_IFINFO(ifp));
	}

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);

	/*
	 * Wait for the created kernel threads for I/O to get
	 * scheduled and run at least once before we proceed
	 * to mark interface as attached.
	 */
	lck_mtx_lock(&ifp->if_ref_lock);
	while (ifp->if_threads_pending != 0) {
		DLIL_PRINTF("%s: Waiting for all kernel threads created for "
		    "interface %s to get scheduled at least once.\n",
		    __func__, ifp->if_xname);
		(void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
		    __func__, NULL);
		LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
	DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
	    "at least once. Proceeding.\n", __func__, ifp->if_xname);

	/* Final mark this ifnet as attached. */
	ifnet_lock_exclusive(ifp);
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
	lck_mtx_unlock(&ifp->if_ref_lock);
	if (net_rtref) {
		/* boot-args override; enable idle notification */
		(void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
		    IFRF_IDLE_NOTIFY);
	} else {
		/* apply previous request(s) to set the idle flags, if any */
		(void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
		    ifp->if_idle_new_flags_mask);
	}
#if SKYWALK
	/* the interface is fully attached; let the nexus adapter know */
	if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
		if (netif_compat) {
			if (sk_netif_compat_txmodel ==
			    NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
				ifnet_enqueue_multi_setup(ifp,
				    sk_tx_delay_qlen, sk_tx_delay_timeout);
			}
			ifp->if_nx_netif = nexus_netif;
		}
		ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
	}
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	dlil_if_unlock();

#if PF
	/*
	 * Attach packet filter to this interface, if enabled.
	 */
	pf_ifnet_hook(ifp, 1);
#endif /* PF */

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
		    (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
	}

	return 0;
}
8823
/*
 * Prepare the storage for the first/permanent link address, which must
 * have the same lifetime as the ifnet itself. Although the link
 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
 * its location in memory must never change as it may still be referred
 * to by some parts of the system afterwards (unfortunate implementation
 * artifacts inherited from BSD.)
 *
 * Caller must hold ifnet lock as writer.
 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa;
	struct sockaddr_dl *asdl, *msdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * An AF_LINK sockaddr carries the interface name in sdl_data
	 * followed by the link-layer address; size the sockaddr to hold
	 * both, with a floor of sizeof(struct sockaddr_dl) and rounded
	 * up to u_int32_t alignment.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			/* ifaddr followed by address and mask storage */
			int ifasize = sizeof(*ifa) + 2 * SOCK_MAXADDRLEN;
			ifa = _MALLOC(ifasize, M_IFADDR, M_WAITOK | M_ZERO);
			if (ifa == NULL) {
				return NULL;
			}
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(ifa + 1);
		bzero(asdl, SOCK_MAXADDRLEN);
		msdl = (struct sockaddr_dl *)(void *)
		    ((char *)asdl + SOCK_MAXADDRLEN);
		bzero(msdl, SOCK_MAXADDRLEN);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure. This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		asdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.asdl;
		bzero(asdl, sizeof(dl_if->dl_if_lladdr.asdl));
		msdl = (struct sockaddr_dl *)(void *)&dl_if->dl_if_lladdr.msdl;
		bzero(msdl, sizeof(dl_if->dl_if_lladdr.msdl));
	}

	/* hold a permanent reference for the ifnet itself */
	IFA_ADDREF_LOCKED(ifa);
	oifa = ifp->if_lladdr;
	ifp->if_lladdr = ifa;

	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)asdl;
	asdl->sdl_len = (u_char)socksize;
	asdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		/* copy is bounded by sdl_data; sdl_nlen records full name */
		bcopy(workbuf, asdl->sdl_data, min(namelen,
		    sizeof(asdl->sdl_data)));
		asdl->sdl_nlen = (u_char)namelen;
	} else {
		asdl->sdl_nlen = 0;
	}
	asdl->sdl_index = ifp->if_index;
	asdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		asdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen);
	} else {
		asdl->sdl_alen = 0;
	}
	ifa->ifa_netmask = (struct sockaddr *)msdl;
	msdl->sdl_len = (u_char)masklen;
	/* netmask: all-ones over the name portion of the sockaddr */
	while (namelen > 0) {
		msdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* release the reference on any previous link address */
	if (oifa != NULL) {
		IFA_REMREF(oifa);
	}

	return ifa;
}
8944
/*
 * Tell the upper layers (IPv4 when built in, and IPv6) to drop all
 * of their network-layer addresses configured on this interface.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
8953
/*
 * First phase of interface detach: mark the ifnet as detaching, remove
 * it from ifnet_head/ifindex2ifnet[] so it is no longer visible to
 * lookups, reset per-interface state, and hand the ifnet to the
 * detacher worker thread, which performs the final teardown
 * (ifnet_detach_final) once outstanding references drain.
 *
 * Returns 0 on success, EINVAL if ifp is NULL or not attached, and
 * ENXIO if a detach is already in progress.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;
#if SKYWALK
	if_nexus_netif nexus_netif;
	if_nexus_flowswitch nexus_fsw;
#endif /* SKYWALK */

	if (ifp == NULL) {
		return EINVAL;
	}

	/* stop IPv6 CGA address generation for this interface */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

#if SKYWALK
	/* save then clear the nexus configuration */
	nexus_netif = ifp->if_nx_netif;
	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
	nexus_fsw = ifp->if_nx_flowswitch;
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
#endif /* SKYWALK */
	ifnet_lock_done(ifp);
	ifnet_head_done();

#if SKYWALK
	/* detach nexus configuration (outside the ifnet locks) */
	dlil_detach_flowswitch_nexus(&nexus_fsw);
	dlil_detach_netif_nexus(&nexus_netif);
#endif /* SKYWALK */

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Let BPF know we're detaching */
	bpfdetach(ifp);

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9163
/*
 * Append an ifnet to the global detaching queue and wake up the
 * detacher thread (ifnet_detacher_thread_cont) to process it.
 * Caller must hold the dlil_if lock.
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);   /* catch counter wraparound */
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	wakeup((caddr_t)&ifnet_delayed_run);
}
9174
9175 static struct ifnet *
ifnet_detaching_dequeue(void)9176 ifnet_detaching_dequeue(void)
9177 {
9178 struct ifnet *ifp;
9179
9180 dlil_if_lock_assert();
9181
9182 ifp = TAILQ_FIRST(&ifnet_detaching_head);
9183 VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9184 if (ifp != NULL) {
9185 VERIFY(ifnet_detaching_cnt != 0);
9186 --ifnet_detaching_cnt;
9187 TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9188 ifp->if_detaching_link.tqe_next = NULL;
9189 ifp->if_detaching_link.tqe_prev = NULL;
9190 }
9191 return ifp;
9192 }
9193
/*
 * Continuation for the detacher thread: drains the detaching queue,
 * calling ifnet_detach_final() for each dequeued ifnet (dropping
 * dlil_if_lock around the call), then blocks on ifnet_delayed_run
 * with itself as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock; final detach may block/sleep */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* queue drained; sleep until the next enqueue wakes us */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9236
/*
 * Entry point for the detacher thread.  Arms the wait on
 * ifnet_delayed_run, marks the thread as embryonic, and issues a
 * self-wakeup so the continuation runs once and clears the embryonic
 * state (decrementing the pending-thread count).  Never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9253
/*
 * Final phase of interface detach, run from the detacher thread:
 * waits for outstanding I/O references to drain, detaches interface
 * filters and unplumbs protocols, removes the permanent link address,
 * terminates the starter/poller/input threads, points the driver
 * entry points at local no-op stubs (the driver may unload), and
 * finally clears IFRF_DETACHING and releases the attach-time ifnet
 * reference.  Panics if IFRF_DETACHING is not set on entry.
 */
static void
ifnet_detach_final(struct ifnet *ifp)
{
	struct ifnet_filter *filter, *filter_next;
	struct dlil_ifnet *dlifp;
	struct ifnet_filter_head fhead;
	struct dlil_threading_info *inp;
	struct ifaddr *ifa;
	ifnet_detached_func if_free;
	int i;

	lck_mtx_lock(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}

	/*
	 * Wait until the existing IO references get released
	 * before we proceed with ifnet_detach. This is not a
	 * common case, so block without using a continuation.
	 */
	while (ifp->if_refio > 0) {
		DLIL_PRINTF("%s: Waiting for IO references on %s interface "
		    "to be released\n", __func__, if_name(ifp));
		(void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
		    (PZERO - 1), "ifnet_ioref_wait", NULL);
	}

	VERIFY(ifp->if_datamov == 0);
	VERIFY(ifp->if_drainers == 0);
	VERIFY(ifp->if_suspend == 0);
	ifp->if_refflags &= ~IFRF_READY;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* Clear agent IDs */
	if (ifp->if_agentids != NULL) {
		kfree_data(ifp->if_agentids,
		    sizeof(uuid_t) * ifp->if_agentcount);
		ifp->if_agentids = NULL;
	}
	ifp->if_agentcount = 0;

#if SKYWALK
	VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
#endif /* SKYWALK */
	/* Drain and destroy send queue */
	ifclassq_teardown(ifp->if_snd);

	/* Detach interface filters */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	/* take over the filter list; detach callbacks may block */
	fhead = ifp->if_flt_head;
	TAILQ_INIT(&ifp->if_flt_head);

	for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
		filter_next = TAILQ_NEXT(filter, filt_next);
		lck_mtx_unlock(&ifp->if_flt_lock);

		dlil_detach_filter_internal(filter, 1);
		lck_mtx_lock(&ifp->if_flt_lock);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Tell upper layers to drop their network addresses */
	if_purgeaddrs(ifp);

	ifnet_lock_exclusive(ifp);

	/* Unplumb all protocols */
	for (i = 0; i < PROTO_HASH_SLOTS; i++) {
		struct if_proto *proto;

		/*
		 * proto_unplumb() needs the ifnet lock dropped; re-read
		 * the slot head each iteration since the list mutates.
		 */
		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		while (proto != NULL) {
			protocol_family_t family = proto->protocol_family;
			ifnet_lock_done(ifp);
			proto_unplumb(family, ifp);
			ifnet_lock_exclusive(ifp);
			proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
		}
		/* There should not be any protocols left */
		VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
	}
	zfree(dlif_phash_zone, ifp->if_proto_hash);
	ifp->if_proto_hash = NULL;

	/* Detach (permanent) link address from if_addrhead */
	ifa = TAILQ_FIRST(&ifp->if_addrhead);
	VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
	IFA_LOCK(ifa);
	if_detach_link_ifa(ifp, ifa);
	IFA_UNLOCK(ifa);

	/* Remove (permanent) link address from ifnet_addrs[] */
	IFA_REMREF(ifa);
	ifnet_addrs[ifp->if_index - 1] = NULL;

	/* This interface should not be on {ifnet_head,detaching} */
	VERIFY(ifp->if_link.tqe_next == NULL);
	VERIFY(ifp->if_link.tqe_prev == NULL);
	VERIFY(ifp->if_detaching_link.tqe_next == NULL);
	VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
	VERIFY(ifp->if_ordered_link.tqe_next == NULL);
	VERIFY(ifp->if_ordered_link.tqe_prev == NULL);

	/* The slot should have been emptied */
	VERIFY(ifindex2ifnet[ifp->if_index] == NULL);

	/* There should not be any addresses left */
	VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));

	/*
	 * Signal the starter thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_start_thread != THREAD_NULL) {
		lck_mtx_lock_spin(&ifp->if_start_lock);
		ifp->if_start_flags |= IFSF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		/* wait for starter thread to terminate */
		lck_mtx_lock(&ifp->if_start_lock);
		while (ifp->if_start_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_start_thread,
			    &ifp->if_start_lock, (PZERO - 1),
			    "ifnet_start_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_start_lock);
		if (dlil_verbose) {
			/* NOTE(review): log message lacks a trailing '\n' */
			DLIL_PRINTF("%s: %s starter thread termination complete",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * Signal the poller thread to terminate itself, and wait until
	 * it has exited.
	 */
	if (ifp->if_poll_thread != THREAD_NULL) {
#if SKYWALK
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		ifp->if_poll_flags |= IF_POLLF_TERMINATING;
		wakeup_one((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		/* wait for poller thread to terminate */
		lck_mtx_lock(&ifp->if_poll_lock);
		while (ifp->if_poll_thread != THREAD_NULL) {
			if (dlil_verbose) {
				DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
				    __func__,
				    if_name(ifp));
			}
			(void) msleep(&ifp->if_poll_thread,
			    &ifp->if_poll_lock, (PZERO - 1),
			    "ifnet_poll_thread_exit", NULL);
		}
		lck_mtx_unlock(&ifp->if_poll_lock);
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s poller thread termination complete\n",
			    __func__, if_name(ifp));
		}
	}

	/*
	 * If thread affinity was set for the workloop thread, we will need
	 * to tear down the affinity and release the extra reference count
	 * taken at attach time. Does not apply to lo0 or other interfaces
	 * without dedicated input threads.
	 */
	if ((inp = ifp->if_inp) != NULL) {
		VERIFY(inp != dlil_main_input_thread);

		if (inp->dlth_affinity) {
			struct thread *tp, *wtp, *ptp;

			/* snapshot and clear the affinity state under lock */
			lck_mtx_lock_spin(&inp->dlth_lock);
			wtp = inp->dlth_driver_thread;
			inp->dlth_driver_thread = THREAD_NULL;
			ptp = inp->dlth_poller_thread;
			inp->dlth_poller_thread = THREAD_NULL;
			ASSERT(inp->dlth_thread != THREAD_NULL);
			tp = inp->dlth_thread;  /* don't nullify now */
			inp->dlth_affinity_tag = 0;
			inp->dlth_affinity = FALSE;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Tear down poll thread affinity */
			if (ptp != NULL) {
				VERIFY(ifp->if_eflags & IFEF_RXPOLL);
				VERIFY(ifp->if_xflags & IFXF_LEGACY);
				(void) dlil_affinity_set(ptp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(ptp);
			}

			/* Tear down workloop thread affinity */
			if (wtp != NULL) {
				(void) dlil_affinity_set(wtp,
				    THREAD_AFFINITY_TAG_NULL);
				thread_deallocate(wtp);
			}

			/* Tear down DLIL input thread affinity */
			(void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
			thread_deallocate(tp);
		}

		/* disassociate ifp DLIL input thread */
		ifp->if_inp = NULL;

		/* if the worker thread was created, tell it to terminate */
		if (inp->dlth_thread != THREAD_NULL) {
			lck_mtx_lock_spin(&inp->dlth_lock);
			inp->dlth_flags |= DLIL_INPUT_TERMINATE;
			if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
				wakeup_one((caddr_t)&inp->dlth_flags);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			/* drop ifnet lock while sleeping on the input thread */
			ifnet_lock_done(ifp);

			/* wait for the input thread to terminate */
			lck_mtx_lock_spin(&inp->dlth_lock);
			while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
			    == 0) {
				(void) msleep(&inp->dlth_flags, &inp->dlth_lock,
				    (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
			}
			lck_mtx_unlock(&inp->dlth_lock);
			ifnet_lock_exclusive(ifp);
		}

		/* clean-up input thread state */
		dlil_clean_threading_info(inp);
		/* clean-up poll parameters */
		VERIFY(ifp->if_poll_thread == THREAD_NULL);
		dlil_reset_rxpoll_params(ifp);
	}

	/* The driver might unload, so point these to ourselves */
	if_free = ifp->if_free;
	ifp->if_output_dlil = ifp_if_output;
	ifp->if_output = ifp_if_output;
	ifp->if_pre_enqueue = ifp_if_output;
	ifp->if_start = ifp_if_start;
	ifp->if_output_ctl = ifp_if_ctl;
	ifp->if_input_dlil = ifp_if_input;
	ifp->if_input_poll = ifp_if_input_poll;
	ifp->if_input_ctl = ifp_if_ctl;
	ifp->if_ioctl = ifp_if_ioctl;
	ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
	ifp->if_free = ifp_if_free;
	ifp->if_demux = ifp_if_demux;
	ifp->if_event = ifp_if_event;
	ifp->if_framer_legacy = ifp_if_framer;
	ifp->if_framer = ifp_if_framer_extended;
	ifp->if_add_proto = ifp_if_add_proto;
	ifp->if_del_proto = ifp_if_del_proto;
	ifp->if_check_multi = ifp_if_check_multi;

	/* wipe out interface description */
	VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
	ifp->if_desc.ifd_len = 0;
	VERIFY(ifp->if_desc.ifd_desc != NULL);
	bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);

	/* there shouldn't be any delegation by now */
	VERIFY(ifp->if_delegated.ifp == NULL);
	VERIFY(ifp->if_delegated.type == 0);
	VERIFY(ifp->if_delegated.family == 0);
	VERIFY(ifp->if_delegated.subfamily == 0);
	VERIFY(ifp->if_delegated.expensive == 0);
	VERIFY(ifp->if_delegated.constrained == 0);

	/* QoS marking get cleared */
	if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
	if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);

#if SKYWALK
	/* the nexus destructor is responsible for clearing these */
	VERIFY(ifp->if_na_ops == NULL);
	VERIFY(ifp->if_na == NULL);
#endif /* SKYWALK */

	/* promiscuous count needs to start at zero again */
	ifp->if_pcount = 0;
	ifp->if_flags &= ~IFF_PROMISC;

	ifnet_lock_done(ifp);

#if PF
	/*
	 * Detach this interface from packet filter, if enabled.
	 */
	pf_ifnet_hook(ifp, 0);
#endif /* PF */

	/* Filter list should be empty */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
	VERIFY(ifp->if_flt_busy == 0);
	VERIFY(ifp->if_flt_waiters == 0);
	VERIFY(ifp->if_flt_non_os_count == 0);
	VERIFY(ifp->if_flt_no_tso_count == 0);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Last chance to drain send queue */
	if_qflush_snd(ifp, 0);

	/* Last chance to cleanup any cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	VERIFY(!ifp->if_fwd_cacheok);
	ROUTE_RELEASE(&ifp->if_fwd_route);
	bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
	ROUTE_RELEASE(&ifp->if_src_route);
	bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
	ROUTE_RELEASE(&ifp->if_src_route6);
	bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	VERIFY(ifp->if_data_threshold == 0);
	VERIFY(ifp->if_dt_tcall != NULL);
	VERIFY(!thread_call_isactive(ifp->if_dt_tcall));

	ifnet_llreach_ifdetach(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0);

	/*
	 * Finally, mark this ifnet as detached.
	 */
	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached\n", if_name(ifp));
	}
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_DETACHING)) {
		panic("%s: flags mismatch (detaching not set) ifp=%p",
		    __func__, ifp);
		/* NOTREACHED */
	}
	ifp->if_refflags &= ~IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);
	/* notify the driver that the ifnet is fully detached */
	if (if_free != NULL) {
		if_free(ifp);
	}

	ifclassq_release(&ifp->if_snd);

	/* we're fully detached, clear the "in use" bit */
	dlifp = (struct dlil_ifnet *)ifp;
	lck_mtx_lock(&dlifp->dl_if_lock);
	ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
	dlifp->dl_if_flags &= ~DLIF_INUSE;
	lck_mtx_unlock(&dlifp->dl_if_lock);

	/* Release reference held during ifnet attach */
	ifnet_release(ifp);
}
9625
/*
 * Detached-ifnet output stub: silently drop the packet chain.
 * Installed in ifnet_detach_final() in case the driver unloads.
 */
errno_t
ifp_if_output(struct ifnet *ifp, struct mbuf *m)
{
#pragma unused(ifp)
	m_freem_list(m);
	return 0;
}
9633
/*
 * Detached-ifnet start stub: purge anything left in the send queue.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9639
/*
 * Detached-ifnet input stub: free the inbound chain and report ENXIO.
 */
static errno_t
ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
#pragma unused(ifp, m_tail, s, poll, tp)
	m_freem_list(m_head);
	return ENXIO;
}
9649
9650 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9651 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9652 struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9653 {
9654 #pragma unused(ifp, flags, max_cnt)
9655 if (m_head != NULL) {
9656 *m_head = NULL;
9657 }
9658 if (m_tail != NULL) {
9659 *m_tail = NULL;
9660 }
9661 if (cnt != NULL) {
9662 *cnt = 0;
9663 }
9664 if (len != NULL) {
9665 *len = 0;
9666 }
9667 }
9668
/*
 * Detached-ifnet input/output control stub: nothing to control.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
9675
/*
 * Detached-ifnet demux stub: free the packet; EJUSTRETURN tells the
 * caller the mbuf has already been consumed.
 */
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	m_freem(m);
	return EJUSTRETURN;
}
9683
/*
 * Detached-ifnet stub: protocols can no longer be attached.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
9691
/*
 * Detached-ifnet stub: no protocols remain to be detached.
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
9698
/*
 * Detached-ifnet stub: multicast membership checks are unsupported.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
9705
/*
 * Detached-ifnet legacy framer stub; forwards to the extended framer.
 * The signature differs per platform: the non-macOS variant carries
 * pre/post framing-length out-parameters, the macOS variant does not.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
9724
9725 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)9726 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
9727 const struct sockaddr *sa, const char *ll, const char *t,
9728 u_int32_t *pre, u_int32_t *post)
9729 {
9730 #pragma unused(ifp, sa, ll, t)
9731 m_freem(*m);
9732 *m = NULL;
9733
9734 if (pre != NULL) {
9735 *pre = 0;
9736 }
9737 if (post != NULL) {
9738 *post = 0;
9739 }
9740
9741 return EJUSTRETURN;
9742 }
9743
/*
 * Detached-ifnet ioctl stub: all commands are unsupported.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
9750
/*
 * Detached-ifnet BPF tap stub: accept and ignore the request.
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
9758
/*
 * Detached-ifnet free stub: the driver's callback already ran (or the
 * driver unloaded), so there is nothing left to free.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
9764
/*
 * Detached-ifnet event stub: discard kernel events.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
9770
/*
 * Find a recyclable dlil_ifnet for (family, uniqueid) or allocate a
 * fresh one.  On success *ifp is set to a referenced ifnet marked
 * DLIF_INUSE (plus DLIF_REUSE when recycled) and 0 is returned.
 * Returns EBUSY if an in-use interface already has the same extended
 * name or unique id, ENOMEM if the unique-id copy fails.
 *
 * The whole operation runs under the global dlil_if_lock; per-entry
 * dl_if_lock is taken around flag/uniqueid inspection.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;   /* first recyclable match */
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		/* keep a private copy of the caller's unique id */
		MALLOC(dlifp1->dl_if_uniqueid, void *, uniqueid_len,
		    M_NKE, M_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* point name/xname at the embedded storage of this entry */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	SLIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	/* initialize every per-ifnet lock used by the data/control paths */
	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* alignment invariant: entry and its if_data are 64-bit aligned */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
9946
/*
 * Return an ifnet to the dlil_ifnet cache: drop the allocation stats
 * counters, free oversized broadcast-address storage, reset if_name /
 * if_xname to the embedded storage (xname becomes "<name>?"), and
 * optionally clear DLIF_INUSE so dlil_if_acquire() may recycle the
 * entry.  Lock order: exclusive ifnet lock, then dl_if_lock.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	/* only out-of-line broadcast addresses were heap-allocated */
	if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
		kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
		ifp->if_broadcast.length = 0;
		ifp->if_broadcast.u.ptr = NULL;
	}
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
9977
/*
 * Public wrapper for _dlil_if_release() that keeps the DLIF_INUSE
 * marker set (the entry is not yet eligible for recycling).
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
9983
/* Acquire the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
9989
/* Release the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
9995
/* Assert that the current thread owns the dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10001
/*
 * Detach the INET and INET6 protocol attachments from an interface
 * being torn down; other protocols are expected to have detached
 * themselves already (see comment below).
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10017
/*
 * Copy the interface's cached IPv4 source route into *dst.
 * The spin lock is converted to a full mutex because route_copyout()
 * may block.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10028
/*
 * Store *src back into the interface's cached IPv4 source route,
 * consuming the caller's reference.  If forwarding-cache use is
 * currently disallowed (if_fwd_cacheok == 0), the route is simply
 * released instead of cached.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10042
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the cached IPv6
 * source route into *dst under the converted cached-route lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10054
/*
 * IPv6 counterpart of ifp_src_route_copyin(): cache *src (consuming
 * the caller's reference), or release it when the forwarding cache
 * is disabled.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10069
/*
 * Look up (and cache) a scoped IPv4 route for src_ip on ifp.
 * Returns a referenced rtentry (caller must release) or NULL if the
 * lookup fails.  A usable cached route for the same destination is
 * returned directly; otherwise a fresh rtalloc1_scoped() result
 * replaces the cache.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route src_rt;
	struct sockaddr_in *dst;

	/* dst aliases the destination sockaddr inside the local route */
	dst = (struct sockaddr_in *)(void *)(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		/* cache miss: rebuild the destination and look up anew */
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst,
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10104
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): look up (and
 * cache) a scoped route for *src_ip6 on ifp.  Returns a referenced
 * rtentry or NULL.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		/* cache miss: rebuild the destination and look up anew */
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			bzero(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/* ro_rt is NULL after ROUTE_RELEASE above */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				(struct sockaddr *)&src_rt.ro_dst, 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10141
/*
 * Update the interface's link-quality metric (LQM) state and post a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED event on change.  The raw lqm
 * value is first normalized to one of the threshold edges.
 *
 * Locking: if 'locked' the caller holds the exclusive ifnet lock.
 * Either way, the lock is released before posting the kernel event
 * and reacquired on behalf of a 'locked' caller before returning.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* quality is at abort level: kick the fast PCB timer so
		 * connections can react promptly */
		atomic_bitset_32(&tcbinfo.ipi_flags,
		    INPCBINFO_HANDLE_LQM_ABORT);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return; /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data));

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10206
/*
 * Update the interface's RRC (radio resource control) state and post
 * KEV_DL_RRC_STATE_CHANGED on change.
 *
 * Locking: must be called with the exclusive ifnet lock held; the
 * lock is dropped around the kernel-event post and reacquired before
 * returning (so the caller's lock state is preserved).
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* no-op when the state is already valid and unchanged */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state));

	ifnet_lock_exclusive(ifp);
}
10236
10237 errno_t
if_state_update(struct ifnet * ifp,struct if_interface_state * if_interface_state)10238 if_state_update(struct ifnet *ifp,
10239 struct if_interface_state *if_interface_state)
10240 {
10241 u_short if_index_available = 0;
10242
10243 ifnet_lock_exclusive(ifp);
10244
10245 if ((ifp->if_type != IFT_CELLULAR) &&
10246 (if_interface_state->valid_bitmask &
10247 IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10248 ifnet_lock_done(ifp);
10249 return ENOTSUP;
10250 }
10251 if ((if_interface_state->valid_bitmask &
10252 IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10253 (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10254 if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10255 ifnet_lock_done(ifp);
10256 return EINVAL;
10257 }
10258 if ((if_interface_state->valid_bitmask &
10259 IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10260 if_interface_state->rrc_state !=
10261 IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10262 if_interface_state->rrc_state !=
10263 IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10264 ifnet_lock_done(ifp);
10265 return EINVAL;
10266 }
10267
10268 if (if_interface_state->valid_bitmask &
10269 IF_INTERFACE_STATE_LQM_STATE_VALID) {
10270 if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10271 }
10272 if (if_interface_state->valid_bitmask &
10273 IF_INTERFACE_STATE_RRC_STATE_VALID) {
10274 if_rrc_state_update(ifp, if_interface_state->rrc_state);
10275 }
10276 if (if_interface_state->valid_bitmask &
10277 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10278 ifp->if_interface_state.valid_bitmask |=
10279 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10280 ifp->if_interface_state.interface_availability =
10281 if_interface_state->interface_availability;
10282
10283 if (ifp->if_interface_state.interface_availability ==
10284 IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10285 os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10286 __func__, if_name(ifp), ifp->if_index);
10287 if_index_available = ifp->if_index;
10288 } else {
10289 os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
10290 __func__, if_name(ifp), ifp->if_index);
10291 }
10292 }
10293 ifnet_lock_done(ifp);
10294
10295 /*
10296 * Check if the TCP connections going on this interface should be
10297 * forced to send probe packets instead of waiting for TCP timers
10298 * to fire. This is done on an explicit notification such as
10299 * SIOCSIFINTERFACESTATE which marks the interface as available.
10300 */
10301 if (if_index_available > 0) {
10302 tcp_interface_send_probe(if_index_available);
10303 }
10304
10305 return 0;
10306 }
10307
10308 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10309 if_get_state(struct ifnet *ifp,
10310 struct if_interface_state *if_interface_state)
10311 {
10312 ifnet_lock_shared(ifp);
10313
10314 if_interface_state->valid_bitmask = 0;
10315
10316 if (ifp->if_interface_state.valid_bitmask &
10317 IF_INTERFACE_STATE_RRC_STATE_VALID) {
10318 if_interface_state->valid_bitmask |=
10319 IF_INTERFACE_STATE_RRC_STATE_VALID;
10320 if_interface_state->rrc_state =
10321 ifp->if_interface_state.rrc_state;
10322 }
10323 if (ifp->if_interface_state.valid_bitmask &
10324 IF_INTERFACE_STATE_LQM_STATE_VALID) {
10325 if_interface_state->valid_bitmask |=
10326 IF_INTERFACE_STATE_LQM_STATE_VALID;
10327 if_interface_state->lqm_state =
10328 ifp->if_interface_state.lqm_state;
10329 }
10330 if (ifp->if_interface_state.valid_bitmask &
10331 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10332 if_interface_state->valid_bitmask |=
10333 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10334 if_interface_state->interface_availability =
10335 ifp->if_interface_state.interface_availability;
10336 }
10337
10338 ifnet_lock_done(ifp);
10339 }
10340
10341 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10342 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10343 {
10344 if (conn_probe > 1) {
10345 return EINVAL;
10346 }
10347 if (conn_probe == 0) {
10348 if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10349 } else {
10350 if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10351 }
10352
10353 #if NECP
10354 necp_update_all_clients();
10355 #endif /* NECP */
10356
10357 tcp_probe_connectivity(ifp, conn_probe);
10358 return 0;
10359 }
10360
10361 /* for uuid.c */
10362 static int
get_ether_index(int * ret_other_index)10363 get_ether_index(int * ret_other_index)
10364 {
10365 struct ifnet *ifp;
10366 int en0_index = 0;
10367 int other_en_index = 0;
10368 int any_ether_index = 0;
10369 short best_unit = 0;
10370
10371 *ret_other_index = 0;
10372 TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
10373 /*
10374 * find en0, or if not en0, the lowest unit en*, and if not
10375 * that, any ethernet
10376 */
10377 ifnet_lock_shared(ifp);
10378 if (strcmp(ifp->if_name, "en") == 0) {
10379 if (ifp->if_unit == 0) {
10380 /* found en0, we're done */
10381 en0_index = ifp->if_index;
10382 ifnet_lock_done(ifp);
10383 break;
10384 }
10385 if (other_en_index == 0 || ifp->if_unit < best_unit) {
10386 other_en_index = ifp->if_index;
10387 best_unit = ifp->if_unit;
10388 }
10389 } else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
10390 any_ether_index = ifp->if_index;
10391 }
10392 ifnet_lock_done(ifp);
10393 }
10394 if (en0_index == 0) {
10395 if (other_en_index != 0) {
10396 *ret_other_index = other_en_index;
10397 } else if (any_ether_index != 0) {
10398 *ret_other_index = any_ether_index;
10399 }
10400 }
10401 return en0_index;
10402 }
10403
/*
 * Fill node[] with a 6-byte (ETHER_ADDR_LEN) ethernet address to seed
 * UUID generation.  Prefers en0 (its index is cached in a static and
 * revalidated against ifindex2ifnet), then the best alternative from
 * get_ether_index().  Uses the permanent ethernet address when it has
 * been recorded, since it never changes.  Returns 0 on success, -1 if
 * no ethernet interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* revalidate the cached en0 index; it may have been detached */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10445
10446 static int
10447 sysctl_rxpoll SYSCTL_HANDLER_ARGS
10448 {
10449 #pragma unused(arg1, arg2)
10450 uint32_t i;
10451 int err;
10452
10453 i = if_rxpoll;
10454
10455 err = sysctl_handle_int(oidp, &i, 0, req);
10456 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10457 return err;
10458 }
10459
10460 if (net_rxpoll == 0) {
10461 return ENXIO;
10462 }
10463
10464 if_rxpoll = i;
10465 return err;
10466 }
10467
10468 static int
10469 sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
10470 {
10471 #pragma unused(arg1, arg2)
10472 uint64_t q;
10473 int err;
10474
10475 q = if_rxpoll_mode_holdtime;
10476
10477 err = sysctl_handle_quad(oidp, &q, 0, req);
10478 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10479 return err;
10480 }
10481
10482 if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
10483 q = IF_RXPOLL_MODE_HOLDTIME_MIN;
10484 }
10485
10486 if_rxpoll_mode_holdtime = q;
10487
10488 return err;
10489 }
10490
10491 static int
10492 sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
10493 {
10494 #pragma unused(arg1, arg2)
10495 uint64_t q;
10496 int err;
10497
10498 q = if_rxpoll_sample_holdtime;
10499
10500 err = sysctl_handle_quad(oidp, &q, 0, req);
10501 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10502 return err;
10503 }
10504
10505 if (q < IF_RXPOLL_SAMPLETIME_MIN) {
10506 q = IF_RXPOLL_SAMPLETIME_MIN;
10507 }
10508
10509 if_rxpoll_sample_holdtime = q;
10510
10511 return err;
10512 }
10513
10514 static int
10515 sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
10516 {
10517 #pragma unused(arg1, arg2)
10518 uint64_t q;
10519 int err;
10520
10521 q = if_rxpoll_interval_time;
10522
10523 err = sysctl_handle_quad(oidp, &q, 0, req);
10524 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10525 return err;
10526 }
10527
10528 if (q < IF_RXPOLL_INTERVALTIME_MIN) {
10529 q = IF_RXPOLL_INTERVALTIME_MIN;
10530 }
10531
10532 if_rxpoll_interval_time = q;
10533
10534 return err;
10535 }
10536
10537 static int
10538 sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
10539 {
10540 #pragma unused(arg1, arg2)
10541 uint32_t i;
10542 int err;
10543
10544 i = if_sysctl_rxpoll_wlowat;
10545
10546 err = sysctl_handle_int(oidp, &i, 0, req);
10547 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10548 return err;
10549 }
10550
10551 if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
10552 return EINVAL;
10553 }
10554
10555 if_sysctl_rxpoll_wlowat = i;
10556 return err;
10557 }
10558
10559 static int
10560 sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
10561 {
10562 #pragma unused(arg1, arg2)
10563 uint32_t i;
10564 int err;
10565
10566 i = if_sysctl_rxpoll_whiwat;
10567
10568 err = sysctl_handle_int(oidp, &i, 0, req);
10569 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10570 return err;
10571 }
10572
10573 if (i <= if_sysctl_rxpoll_wlowat) {
10574 return EINVAL;
10575 }
10576
10577 if_sysctl_rxpoll_whiwat = i;
10578 return err;
10579 }
10580
10581 static int
10582 sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
10583 {
10584 #pragma unused(arg1, arg2)
10585 int i, err;
10586
10587 i = if_sndq_maxlen;
10588
10589 err = sysctl_handle_int(oidp, &i, 0, req);
10590 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10591 return err;
10592 }
10593
10594 if (i < IF_SNDQ_MINLEN) {
10595 i = IF_SNDQ_MINLEN;
10596 }
10597
10598 if_sndq_maxlen = i;
10599 return err;
10600 }
10601
10602 static int
10603 sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
10604 {
10605 #pragma unused(arg1, arg2)
10606 int i, err;
10607
10608 i = if_rcvq_maxlen;
10609
10610 err = sysctl_handle_int(oidp, &i, 0, req);
10611 if (err != 0 || req->newptr == USER_ADDR_NULL) {
10612 return err;
10613 }
10614
10615 if (i < IF_RCVQ_MINLEN) {
10616 i = IF_RCVQ_MINLEN;
10617 }
10618
10619 if_rcvq_maxlen = i;
10620 return err;
10621 }
10622
10623 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10624 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10625 int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10626 {
10627 struct kev_dl_node_presence kev;
10628 struct sockaddr_dl *sdl;
10629 struct sockaddr_in6 *sin6;
10630 int ret = 0;
10631
10632 VERIFY(ifp);
10633 VERIFY(sa);
10634 VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10635
10636 bzero(&kev, sizeof(kev));
10637 sin6 = &kev.sin6_node_address;
10638 sdl = &kev.sdl_node_address;
10639 nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10640 kev.rssi = rssi;
10641 kev.link_quality_metric = lqm;
10642 kev.node_proximity_metric = npm;
10643 bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10644
10645 ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10646 if (ret == 0) {
10647 int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10648 &kev.link_data, sizeof(kev));
10649 if (err != 0) {
10650 log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10651 "error %d\n", __func__, err);
10652 }
10653 }
10654 return ret;
10655 }
10656
/*
 * Record that a neighbor node has disappeared from ifp, clear the
 * corresponding ND6 neighbor state, and post KEV_DL_NODE_ABSENCE.
 * Accepts either an AF_INET6 address (link-layer address recovered
 * from the neighbor cache) or an AF_LINK address (IPv6 address
 * derived from it).
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	/* stamp the event's link-layer address with this interface */
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
	    &kev.link_data, sizeof(kev));
}
10694
/*
 * Variant of dlil_node_present() where the caller supplies both the
 * AF_INET6 address and the AF_LINK address explicitly (no address
 * decomposition needed).  Updates ND6 neighbor state and, on success,
 * posts KEV_DL_NODE_PRESENCE with both addresses and the metrics.
 * Returns the nd6_alt_node_present() result.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	/* copy the link-layer address, stamped with this interface */
	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev));
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}
	return ret;
}
10734
10735 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)10736 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
10737 kauth_cred_t *credp)
10738 {
10739 const u_int8_t *bytes;
10740 size_t size;
10741
10742 bytes = CONST_LLADDR(sdl);
10743 size = sdl->sdl_alen;
10744
10745 #if CONFIG_MACF
10746 if (dlil_lladdr_ckreq) {
10747 switch (sdl->sdl_type) {
10748 case IFT_ETHER:
10749 case IFT_IEEE1394:
10750 break;
10751 default:
10752 credp = NULL;
10753 break;
10754 }
10755 ;
10756
10757 if (credp && mac_system_check_info(*credp, "net.link.addr")) {
10758 static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
10759 [0] = 2
10760 };
10761
10762 bytes = unspec;
10763 }
10764 }
10765 #else
10766 #pragma unused(credp)
10767 #endif
10768
10769 if (sizep != NULL) {
10770 *sizep = size;
10771 }
10772 return bytes;
10773 }
10774
10775 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])10776 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
10777 u_int8_t info[DLIL_MODARGLEN])
10778 {
10779 struct kev_dl_issues kev;
10780 struct timeval tv;
10781
10782 VERIFY(ifp != NULL);
10783 VERIFY(modid != NULL);
10784 _CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
10785 _CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
10786
10787 bzero(&kev, sizeof(kev));
10788
10789 microtime(&tv);
10790 kev.timestamp = tv.tv_sec;
10791 bcopy(modid, &kev.modid, DLIL_MODIDLEN);
10792 if (info != NULL) {
10793 bcopy(info, &kev.info, DLIL_MODARGLEN);
10794 }
10795
10796 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
10797 &kev.link_data, sizeof(kev));
10798 }
10799
/*
 * Handle SIOCSIFOPPORTUNISTIC (set, root-only) and
 * SIOCGIFOPPORTUNISTIC (get) for the interface throttling level.
 * Setting maps ifo_flags to IFNET_THROTTLE_OPPORTUNISTIC/OFF and
 * applies it via ifnet_set_throttle(); getting reflects the current
 * level back into ifo_flags.  On success, ifo_inuse is also filled
 * with the count of opportunistic TCP+UDP connections on the
 * interface.  EALREADY from the throttle layer is mapped to success.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
10858
10859 int
ifnet_get_throttle(struct ifnet * ifp,u_int32_t * level)10860 ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
10861 {
10862 struct ifclassq *ifq;
10863 int err = 0;
10864
10865 if (!(ifp->if_eflags & IFEF_TXSTART)) {
10866 return ENXIO;
10867 }
10868
10869 *level = IFNET_THROTTLE_OFF;
10870
10871 ifq = ifp->if_snd;
10872 IFCQ_LOCK(ifq);
10873 /* Throttling works only for IFCQ, not ALTQ instances */
10874 if (IFCQ_IS_ENABLED(ifq)) {
10875 cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
10876
10877 err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
10878 *level = req.level;
10879 }
10880 IFCQ_UNLOCK(ifq);
10881
10882 return err;
10883 }
10884
10885 int
ifnet_set_throttle(struct ifnet * ifp,u_int32_t level)10886 ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
10887 {
10888 struct ifclassq *ifq;
10889 int err = 0;
10890
10891 if (!(ifp->if_eflags & IFEF_TXSTART)) {
10892 return ENXIO;
10893 }
10894
10895 ifq = ifp->if_snd;
10896
10897 switch (level) {
10898 case IFNET_THROTTLE_OFF:
10899 case IFNET_THROTTLE_OPPORTUNISTIC:
10900 break;
10901 default:
10902 return EINVAL;
10903 }
10904
10905 IFCQ_LOCK(ifq);
10906 if (IFCQ_IS_ENABLED(ifq)) {
10907 cqrq_throttle_t req = { 1, level };
10908
10909 err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
10910 }
10911 IFCQ_UNLOCK(ifq);
10912
10913 if (err == 0) {
10914 DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
10915 level);
10916 #if NECP
10917 necp_update_all_clients();
10918 #endif /* NECP */
10919 if (level == IFNET_THROTTLE_OFF) {
10920 ifnet_start(ifp);
10921 }
10922 }
10923
10924 return err;
10925 }
10926
10927 errno_t
ifnet_getset_log(ifnet_t ifp,u_long cmd,struct ifreq * ifr,struct proc * p)10928 ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
10929 struct proc *p)
10930 {
10931 #pragma unused(p)
10932 errno_t result = 0;
10933 uint32_t flags;
10934 int level, category, subcategory;
10935
10936 VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
10937
10938 if (cmd == SIOCSIFLOG) {
10939 if ((result = priv_check_cred(kauth_cred_get(),
10940 PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
10941 return result;
10942 }
10943
10944 level = ifr->ifr_log.ifl_level;
10945 if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
10946 result = EINVAL;
10947 }
10948
10949 flags = ifr->ifr_log.ifl_flags;
10950 if ((flags &= IFNET_LOGF_MASK) == 0) {
10951 result = EINVAL;
10952 }
10953
10954 category = ifr->ifr_log.ifl_category;
10955 subcategory = ifr->ifr_log.ifl_subcategory;
10956
10957 if (result == 0) {
10958 result = ifnet_set_log(ifp, level, flags,
10959 category, subcategory);
10960 }
10961 } else {
10962 result = ifnet_get_log(ifp, &level, &flags, &category,
10963 &subcategory);
10964 if (result == 0) {
10965 ifr->ifr_log.ifl_level = level;
10966 ifr->ifr_log.ifl_flags = flags;
10967 ifr->ifr_log.ifl_category = category;
10968 ifr->ifr_log.ifl_subcategory = subcategory;
10969 }
10970 }
10971
10972 return result;
10973 }
10974
10975 int
ifnet_set_log(struct ifnet * ifp,int32_t level,uint32_t flags,int32_t category,int32_t subcategory)10976 ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
10977 int32_t category, int32_t subcategory)
10978 {
10979 int err = 0;
10980
10981 VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
10982 VERIFY(flags & IFNET_LOGF_MASK);
10983
10984 /*
10985 * The logging level applies to all facilities; make sure to
10986 * update them all with the most current level.
10987 */
10988 flags |= ifp->if_log.flags;
10989
10990 if (ifp->if_output_ctl != NULL) {
10991 struct ifnet_log_params l;
10992
10993 bzero(&l, sizeof(l));
10994 l.level = level;
10995 l.flags = flags;
10996 l.flags &= ~IFNET_LOGF_DLIL;
10997 l.category = category;
10998 l.subcategory = subcategory;
10999
11000 /* Send this request to lower layers */
11001 if (l.flags != 0) {
11002 err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
11003 sizeof(l), &l);
11004 }
11005 } else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
11006 /*
11007 * If targeted to the lower layers without an output
11008 * control callback registered on the interface, just
11009 * silently ignore facilities other than ours.
11010 */
11011 flags &= IFNET_LOGF_DLIL;
11012 if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
11013 level = 0;
11014 }
11015 }
11016
11017 if (err == 0) {
11018 if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
11019 ifp->if_log.flags = 0;
11020 } else {
11021 ifp->if_log.flags |= flags;
11022 }
11023
11024 log(LOG_INFO, "%s: logging level set to %d flags=%b "
11025 "arg=%b, category=%d subcategory=%d\n", if_name(ifp),
11026 ifp->if_log.level, ifp->if_log.flags,
11027 IFNET_LOGF_BITS, flags, IFNET_LOGF_BITS,
11028 category, subcategory);
11029 }
11030
11031 return err;
11032 }
11033
/*
 * Copy out the interface's current logging settings.  Each output
 * pointer is optional; NULL pointers are simply skipped.  Always
 * returns 0.
 */
int
ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
    int32_t *category, int32_t *subcategory)
{
	if (level != NULL) {
		*level = ifp->if_log.level;
	}
	if (flags != NULL) {
		*flags = ifp->if_log.flags;
	}
	if (category != NULL) {
		*category = ifp->if_log.category;
	}
	if (subcategory != NULL) {
		*subcategory = ifp->if_log.subcategory;
	}

	return 0;
}
11053
11054 int
ifnet_notify_address(struct ifnet * ifp,int af)11055 ifnet_notify_address(struct ifnet *ifp, int af)
11056 {
11057 struct ifnet_notify_address_params na;
11058
11059 #if PF
11060 (void) pf_ifaddr_hook(ifp);
11061 #endif /* PF */
11062
11063 if (ifp->if_output_ctl == NULL) {
11064 return EOPNOTSUPP;
11065 }
11066
11067 bzero(&na, sizeof(na));
11068 na.address_family = (sa_family_t)af;
11069
11070 return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11071 sizeof(na), &na);
11072 }
11073
11074 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11075 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11076 {
11077 if (ifp == NULL || flowid == NULL) {
11078 return EINVAL;
11079 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11080 !IF_FULLY_ATTACHED(ifp)) {
11081 return ENXIO;
11082 }
11083
11084 *flowid = ifp->if_flowhash;
11085
11086 return 0;
11087 }
11088
11089 errno_t
ifnet_disable_output(struct ifnet * ifp)11090 ifnet_disable_output(struct ifnet *ifp)
11091 {
11092 int err;
11093
11094 if (ifp == NULL) {
11095 return EINVAL;
11096 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11097 !IF_FULLY_ATTACHED(ifp)) {
11098 return ENXIO;
11099 }
11100
11101 if ((err = ifnet_fc_add(ifp)) == 0) {
11102 lck_mtx_lock_spin(&ifp->if_start_lock);
11103 ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11104 lck_mtx_unlock(&ifp->if_start_lock);
11105 }
11106 return err;
11107 }
11108
11109 errno_t
ifnet_enable_output(struct ifnet * ifp)11110 ifnet_enable_output(struct ifnet *ifp)
11111 {
11112 if (ifp == NULL) {
11113 return EINVAL;
11114 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11115 !IF_FULLY_ATTACHED(ifp)) {
11116 return ENXIO;
11117 }
11118
11119 ifnet_start_common(ifp, TRUE);
11120 return 0;
11121 }
11122
11123 void
ifnet_flowadv(uint32_t flowhash)11124 ifnet_flowadv(uint32_t flowhash)
11125 {
11126 struct ifnet_fc_entry *ifce;
11127 struct ifnet *ifp;
11128
11129 ifce = ifnet_fc_get(flowhash);
11130 if (ifce == NULL) {
11131 return;
11132 }
11133
11134 VERIFY(ifce->ifce_ifp != NULL);
11135 ifp = ifce->ifce_ifp;
11136
11137 /* flow hash gets recalculated per attach, so check */
11138 if (ifnet_is_attached(ifp, 1)) {
11139 if (ifp->if_flowhash == flowhash) {
11140 (void) ifnet_enable_output(ifp);
11141 }
11142 ifnet_decr_iorefcnt(ifp);
11143 }
11144 ifnet_fc_entry_free(ifce);
11145 }
11146
11147 /*
11148 * Function to compare ifnet_fc_entries in ifnet flow control tree
11149 */
11150 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11151 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11152 {
11153 return fc1->ifce_flowhash - fc2->ifce_flowhash;
11154 }
11155
/*
 * Register the interface's current flow hash in the global flow control
 * tree (ifnet_fc_tree) so that a later flow advisory (ifnet_flowadv)
 * can map the hash back to this interface.
 *
 * Returns 0 when the entry was inserted or already present for this
 * interface, EAGAIN when a different interface's entry already occupies
 * the same hash (rare collision; we simply decline to add a second).
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* Build a stack key holding just the hash for the RB lookup. */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer. There can be a collision
		 * on flow hash but the probability is low. Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex: the zalloc below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11199
/*
 * Look up — and remove — the flow control entry for the given hash.
 * Ownership of the returned entry passes to the caller, who must free
 * it with ifnet_fc_entry_free().  Returns NULL when there is no entry,
 * or when the entry's interface is detached/detaching (in which case
 * the entry is freed here).
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* Build a stack key holding just the hash for the RB lookup. */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* Detach the entry from the tree before handing it out. */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11237
/* Return a flow control entry to its zone. */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11243
11244 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11245 ifnet_calc_flowhash(struct ifnet *ifp)
11246 {
11247 struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11248 uint32_t flowhash = 0;
11249
11250 if (ifnet_flowhash_seed == 0) {
11251 ifnet_flowhash_seed = RandomULong();
11252 }
11253
11254 bzero(&fh, sizeof(fh));
11255
11256 (void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11257 fh.ifk_unit = ifp->if_unit;
11258 fh.ifk_flags = ifp->if_flags;
11259 fh.ifk_eflags = ifp->if_eflags;
11260 fh.ifk_capabilities = ifp->if_capabilities;
11261 fh.ifk_capenable = ifp->if_capenable;
11262 fh.ifk_output_sched_model = ifp->if_output_sched_model;
11263 fh.ifk_rand1 = RandomULong();
11264 fh.ifk_rand2 = RandomULong();
11265
11266 try_again:
11267 flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11268 if (flowhash == 0) {
11269 /* try to get a non-zero flowhash */
11270 ifnet_flowhash_seed = RandomULong();
11271 goto try_again;
11272 }
11273
11274 return flowhash;
11275 }
11276
/*
 * Install (or clear, when len == 0) the network signature for the given
 * address family on the interface.  The signature is stored in the
 * per-family extra data (IN_IFEXTRA / IN6_IFEXTRA) under the family's
 * exclusive data lock.
 *
 * Returns EINVAL for an unsupported family or oversized signature,
 * ENOMEM when the per-family extra data was never allocated.
 * The flags argument is currently unused.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11338
/*
 * Copy out the network signature for the given address family.  On
 * input *len is the caller's buffer size; on success it is updated to
 * the actual signature length and the bytes are copied into data.
 *
 * Returns EINVAL for bad arguments, an unsupported family, or a buffer
 * smaller than the stored signature; ENOENT when no signature is set;
 * ENOMEM when the per-family extra data was never allocated.  On
 * success, *flags (if non-NULL) is cleared — no flags are defined.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* Caller's buffer must hold the whole signature. */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* Caller's buffer must hold the whole signature. */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11399
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on the interface.
 * A slot with prefix_len == 0 clears that slot; otherwise the prefix
 * length must be one of the RFC 6052 lengths (32/40/48/56/64/96) and
 * the prefix must not have embedded interface/link-local scope.
 * NECP clients are notified when at least one prefix was set.
 *
 * Returns EINVAL on a bad length or scoped prefix, ENOMEM when the
 * inet6 extra data was never allocated.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	/* Notify NECP only after dropping the lock. */
	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11465
11466 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)11467 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11468 {
11469 int i, found_one = 0, error = 0;
11470
11471 if (ifp == NULL) {
11472 return EINVAL;
11473 }
11474
11475 if_inet6data_lock_shared(ifp);
11476
11477 if (IN6_IFEXTRA(ifp) == NULL) {
11478 error = ENOMEM;
11479 goto out;
11480 }
11481
11482 for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11483 if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
11484 found_one = 1;
11485 }
11486 }
11487
11488 if (found_one == 0) {
11489 error = ENOENT;
11490 goto out;
11491 }
11492
11493 if (prefixes) {
11494 bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
11495 sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
11496 }
11497
11498 out:
11499 if_inet6data_lock_done(ifp);
11500
11501 return error;
11502 }
11503
/*
 * Debug hook for the output path: when HWCKSUM_DBG_FINALIZE_FORCED is
 * set, finalize (compute in software) any delayed checksums so packets
 * leave with valid sums even though hardware offload was requested.
 * Counts finalized headers/data in the hwcksum_dbg_* statistics.
 * hoff is the offset of the IP header within the mbuf chain.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	/* Skip unless forced-finalize is on; never touch TSO packets. */
	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		return;
	}
}
11545
/*
 * Debug hook for the input path.  Depending on hwcksum_dbg_mode it can:
 *  - force partial checksum offload (HWCKSUM_DBG_PARTIAL_FORCED):
 *    strip any RX checksum state and recompute a 16-bit 1's complement
 *    sum in software, to simulate hardware without offload support;
 *  - verify a driver-supplied partial checksum against a software
 *    computation, counting mismatches (hwcksum_dbg_bad_cksum);
 *  - re-anchor the partial sum at a different start offset
 *    (HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) to emulate hardware that sums
 *    from various offsets.
 * frame_header points at the link-layer header, which must lie within
 * the mbuf's data area, at or before the current data pointer.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity-check the frame header pointer before doing arithmetic. */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Link-layer header length, i.e. distance to the payload. */
	hlen = (uint32_t)(m->m_data - frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		/* Only IP traffic participates in checksum debugging. */
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* Discard whatever RX checksum state the driver set. */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		/* csum_rx_start is relative to the frame header. */
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* Re-anchor the verified sum at the new offset. */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11670
11671 static int
11672 sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
11673 {
11674 #pragma unused(arg1, arg2)
11675 u_int32_t i;
11676 int err;
11677
11678 i = hwcksum_dbg_mode;
11679
11680 err = sysctl_handle_int(oidp, &i, 0, req);
11681 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11682 return err;
11683 }
11684
11685 if (hwcksum_dbg == 0) {
11686 return ENODEV;
11687 }
11688
11689 if ((i & ~HWCKSUM_DBG_MASK) != 0) {
11690 return EINVAL;
11691 }
11692
11693 hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
11694
11695 return err;
11696 }
11697
11698 static int
11699 sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
11700 {
11701 #pragma unused(arg1, arg2)
11702 u_int32_t i;
11703 int err;
11704
11705 i = hwcksum_dbg_partial_rxoff_forced;
11706
11707 err = sysctl_handle_int(oidp, &i, 0, req);
11708 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11709 return err;
11710 }
11711
11712 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
11713 return ENODEV;
11714 }
11715
11716 hwcksum_dbg_partial_rxoff_forced = i;
11717
11718 return err;
11719 }
11720
11721 static int
11722 sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
11723 {
11724 #pragma unused(arg1, arg2)
11725 u_int32_t i;
11726 int err;
11727
11728 i = hwcksum_dbg_partial_rxoff_adj;
11729
11730 err = sysctl_handle_int(oidp, &i, 0, req);
11731 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11732 return err;
11733 }
11734
11735 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
11736 return ENODEV;
11737 }
11738
11739 hwcksum_dbg_partial_rxoff_adj = i;
11740
11741 return err;
11742 }
11743
11744 static int
11745 sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
11746 {
11747 #pragma unused(oidp, arg1, arg2)
11748 int err;
11749
11750 if (req->oldptr == USER_ADDR_NULL) {
11751 }
11752 if (req->newptr != USER_ADDR_NULL) {
11753 return EPERM;
11754 }
11755 err = SYSCTL_OUT(req, &tx_chain_len_stats,
11756 sizeof(struct chain_len_stats));
11757
11758 return err;
11759 }
11760
11761
11762 #if DEBUG || DEVELOPMENT
11763 /* Blob for sum16 verification */
11764 static uint8_t sumdata[] = {
11765 0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
11766 0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
11767 0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
11768 0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
11769 0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
11770 0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
11771 0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
11772 0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
11773 0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
11774 0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
11775 0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
11776 0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
11777 0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
11778 0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
11779 0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
11780 0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
11781 0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
11782 0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
11783 0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
11784 0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
11785 0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
11786 0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
11787 0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
11788 0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
11789 0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
11790 0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
11791 0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
11792 0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
11793 0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
11794 0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
11795 0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
11796 0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
11797 0xc8, 0x28, 0x02, 0x00, 0x00
11798 };
11799
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t init;         /* TRUE once sumr has been computed at runtime */
	uint16_t len;           /* number of leading sumdata bytes summed */
	uint16_t sumr; /* reference */
	uint16_t sumrp; /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* Number of entries in sumtbl */
#define SUMTBL_MAX ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
11824
11825 static void
dlil_verify_sum16(void)11826 dlil_verify_sum16(void)
11827 {
11828 struct mbuf *m;
11829 uint8_t *buf;
11830 int n;
11831
11832 /* Make sure test data plus extra room for alignment fits in cluster */
11833 _CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);
11834
11835 kprintf("DLIL: running SUM16 self-tests ... ");
11836
11837 m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
11838 m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));
11839
11840 buf = mtod(m, uint8_t *); /* base address */
11841
11842 for (n = 0; n < SUMTBL_MAX; n++) {
11843 uint16_t len = sumtbl[n].len;
11844 int i;
11845
11846 /* Verify for all possible alignments */
11847 for (i = 0; i < (int)sizeof(uint64_t); i++) {
11848 uint16_t sum, sumr;
11849 uint8_t *c;
11850
11851 /* Copy over test data to mbuf */
11852 VERIFY(len <= sizeof(sumdata));
11853 c = buf + i;
11854 bcopy(sumdata, c, len);
11855
11856 /* Zero-offset test (align by data pointer) */
11857 m->m_data = (caddr_t)c;
11858 m->m_len = len;
11859 sum = m_sum16(m, 0, len);
11860
11861 if (!sumtbl[n].init) {
11862 sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
11863 sumtbl[n].sumr = sumr;
11864 sumtbl[n].init = TRUE;
11865 } else {
11866 sumr = sumtbl[n].sumr;
11867 }
11868
11869 /* Something is horribly broken; stop now */
11870 if (sumr != sumtbl[n].sumrp) {
11871 panic_plain("\n%s: broken in_cksum_mbuf_ref() "
11872 "for len=%d align=%d sum=0x%04x "
11873 "[expected=0x%04x]\n", __func__,
11874 len, i, sum, sumr);
11875 /* NOTREACHED */
11876 } else if (sum != sumr) {
11877 panic_plain("\n%s: broken m_sum16() for len=%d "
11878 "align=%d sum=0x%04x [expected=0x%04x]\n",
11879 __func__, len, i, sum, sumr);
11880 /* NOTREACHED */
11881 }
11882
11883 /* Alignment test by offset (fixed data pointer) */
11884 m->m_data = (caddr_t)buf;
11885 m->m_len = i + len;
11886 sum = m_sum16(m, i, len);
11887
11888 /* Something is horribly broken; stop now */
11889 if (sum != sumr) {
11890 panic_plain("\n%s: broken m_sum16() for len=%d "
11891 "offset=%d sum=0x%04x [expected=0x%04x]\n",
11892 __func__, len, i, sum, sumr);
11893 /* NOTREACHED */
11894 }
11895 #if INET
11896 /* Simple sum16 contiguous buffer test by aligment */
11897 sum = b_sum16(c, len);
11898
11899 /* Something is horribly broken; stop now */
11900 if (sum != sumr) {
11901 panic_plain("\n%s: broken b_sum16() for len=%d "
11902 "align=%d sum=0x%04x [expected=0x%04x]\n",
11903 __func__, len, i, sum, sumr);
11904 /* NOTREACHED */
11905 }
11906 #endif /* INET */
11907 }
11908 }
11909 m_freem(m);
11910
11911 kprintf("PASSED\n");
11912 }
11913 #endif /* DEBUG || DEVELOPMENT */
11914
11915 #define CASE_STRINGIFY(x) case x: return #x
11916
/*
 * Map a KEV_DL_* event code to its symbolic name (via CASE_STRINGIFY).
 * Returns the empty string for unknown codes.
 */
__private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)
{
	switch (event_code) {
	CASE_STRINGIFY(KEV_DL_SIFFLAGS);
	CASE_STRINGIFY(KEV_DL_SIFMETRICS);
	CASE_STRINGIFY(KEV_DL_SIFMTU);
	CASE_STRINGIFY(KEV_DL_SIFPHYS);
	CASE_STRINGIFY(KEV_DL_SIFMEDIA);
	CASE_STRINGIFY(KEV_DL_SIFGENERIC);
	CASE_STRINGIFY(KEV_DL_ADDMULTI);
	CASE_STRINGIFY(KEV_DL_DELMULTI);
	CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
	CASE_STRINGIFY(KEV_DL_IF_DETACHING);
	CASE_STRINGIFY(KEV_DL_IF_DETACHED);
	CASE_STRINGIFY(KEV_DL_LINK_OFF);
	CASE_STRINGIFY(KEV_DL_LINK_ON);
	CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
	CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
	CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
	CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
	CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
	CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
	CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
	CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
	CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
	CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
	CASE_STRINGIFY(KEV_DL_ISSUES);
	CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
	default:
		break;
	}
	return "";
}
11951
11952 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)11953 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
11954 {
11955 #pragma unused(arg1)
11956 struct ifnet *ifp = arg0;
11957
11958 if (ifnet_is_attached(ifp, 1)) {
11959 nstat_ifnet_threshold_reached(ifp->if_index);
11960 ifnet_decr_iorefcnt(ifp);
11961 }
11962 }
11963
/*
 * Check whether the interface's combined in+out byte count has grown
 * past if_data_threshold since the last notification and, if so,
 * schedule dlil_dt_tcall_fn (via if_dt_tcall) to notify
 * NetworkStatistics.  Safe to call from the data path: the CAS below
 * lets exactly one caller claim the update.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	uint64_t oldbytes = ifp->if_dt_bytes; /* snapshot at last notify */

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 *
	 * Short-circuit order matters: the CAS both detects a racing
	 * caller (only one thread advances if_dt_bytes) and records the
	 * new baseline; thread_call_isactive() avoids re-arming a call
	 * that is already pending.
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Rate-limited: fire on the next periodic boundary */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No interval configured: notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
11993
11994 #if (DEVELOPMENT || DEBUG)
11995 /*
11996 * The sysctl variable name contains the input parameters of
11997 * ifnet_get_keepalive_offload_frames()
11998 * ifp (interface index): name[0]
11999 * frames_array_count: name[1]
12000 * frame_data_offset: name[2]
12001 * The return length gives used_frames_count
12002 */
12003 static int
12004 sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
12005 {
12006 #pragma unused(oidp)
12007 int *name = (int *)arg1;
12008 u_int namelen = arg2;
12009 int idx;
12010 ifnet_t ifp = NULL;
12011 u_int32_t frames_array_count;
12012 size_t frame_data_offset;
12013 u_int32_t used_frames_count;
12014 struct ifnet_keepalive_offload_frame *frames_array = NULL;
12015 int error = 0;
12016 u_int32_t i;
12017
12018 /*
12019 * Only root can get look at other people TCP frames
12020 */
12021 error = proc_suser(current_proc());
12022 if (error != 0) {
12023 goto done;
12024 }
12025 /*
12026 * Validate the input parameters
12027 */
12028 if (req->newptr != USER_ADDR_NULL) {
12029 error = EPERM;
12030 goto done;
12031 }
12032 if (namelen != 3) {
12033 error = EINVAL;
12034 goto done;
12035 }
12036 if (req->oldptr == USER_ADDR_NULL) {
12037 error = EINVAL;
12038 goto done;
12039 }
12040 if (req->oldlen == 0) {
12041 error = EINVAL;
12042 goto done;
12043 }
12044 idx = name[0];
12045 frames_array_count = name[1];
12046 frame_data_offset = name[2];
12047
12048 /* Make sure the passed buffer is large enough */
12049 if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
12050 req->oldlen) {
12051 error = ENOMEM;
12052 goto done;
12053 }
12054
12055 ifnet_head_lock_shared();
12056 if (!IF_INDEX_IN_RANGE(idx)) {
12057 ifnet_head_done();
12058 error = ENOENT;
12059 goto done;
12060 }
12061 ifp = ifindex2ifnet[idx];
12062 ifnet_head_done();
12063
12064 frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
12065 frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
12066 Z_WAITOK);
12067 if (frames_array == NULL) {
12068 error = ENOMEM;
12069 goto done;
12070 }
12071
12072 error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
12073 frames_array_count, frame_data_offset, &used_frames_count);
12074 if (error != 0) {
12075 DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
12076 __func__, error);
12077 goto done;
12078 }
12079
12080 for (i = 0; i < used_frames_count; i++) {
12081 error = SYSCTL_OUT(req, frames_array + i,
12082 sizeof(struct ifnet_keepalive_offload_frame));
12083 if (error != 0) {
12084 goto done;
12085 }
12086 }
12087 done:
12088 if (frames_array != NULL) {
12089 kfree_data(frames_array, frames_array_count *
12090 sizeof(struct ifnet_keepalive_offload_frame));
12091 }
12092 return error;
12093 }
12094 #endif /* DEVELOPMENT || DEBUG */
12095
/*
 * Record per-flow statistics against an interface; currently this just
 * forwards to the TCP layer's accounting.
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12102
12103 static inline u_int32_t
_set_flags(u_int32_t * flags_p,u_int32_t set_flags)12104 _set_flags(u_int32_t *flags_p, u_int32_t set_flags)
12105 {
12106 return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
12107 }
12108
12109 static inline void
_clear_flags(u_int32_t * flags_p,u_int32_t clear_flags)12110 _clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
12111 {
12112 OSBitAndAtomic(~clear_flags, flags_p);
12113 }
12114
12115 __private_extern__ u_int32_t
if_set_eflags(ifnet_t interface,u_int32_t set_flags)12116 if_set_eflags(ifnet_t interface, u_int32_t set_flags)
12117 {
12118 return _set_flags(&interface->if_eflags, set_flags);
12119 }
12120
12121 __private_extern__ void
if_clear_eflags(ifnet_t interface,u_int32_t clear_flags)12122 if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
12123 {
12124 _clear_flags(&interface->if_eflags, clear_flags);
12125 }
12126
12127 __private_extern__ u_int32_t
if_set_xflags(ifnet_t interface,u_int32_t set_flags)12128 if_set_xflags(ifnet_t interface, u_int32_t set_flags)
12129 {
12130 return _set_flags(&interface->if_xflags, set_flags);
12131 }
12132
12133 __private_extern__ void
if_clear_xflags(ifnet_t interface,u_int32_t clear_flags)12134 if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
12135 {
12136 _clear_flags(&interface->if_xflags, clear_flags);
12137 }
12138
12139 static void
log_hexdump(void * data,size_t len)12140 log_hexdump(void *data, size_t len)
12141 {
12142 size_t i, j, k;
12143 unsigned char *ptr = (unsigned char *)data;
12144 #define MAX_DUMP_BUF 32
12145 unsigned char buf[3 * MAX_DUMP_BUF + 1];
12146
12147 for (i = 0; i < len; i += MAX_DUMP_BUF) {
12148 for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12149 unsigned char msnbl = ptr[j] >> 4;
12150 unsigned char lsnbl = ptr[j] & 0x0f;
12151
12152 buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12153 buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12154
12155 if ((j % 2) == 1) {
12156 buf[k++] = ' ';
12157 }
12158 if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12159 buf[k++] = ' ';
12160 }
12161 }
12162 buf[k] = 0;
12163 os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12164 }
12165 }
12166
12167 #if defined(SKYWALK) && defined(XNU_TARGET_OS_OSX)
12168 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12169 net_check_compatible_if_filter(struct ifnet *ifp)
12170 {
12171 if (ifp == NULL) {
12172 if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12173 return false;
12174 }
12175 } else {
12176 if (ifp->if_flt_non_os_count > 0) {
12177 return false;
12178 }
12179 }
12180 return true;
12181 }
12182 #endif /* SKYWALK && XNU_TARGET_OS_OSX */
12183