1 /*
2 * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections. This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/dlil_sysctl.h>
54 #include <net/dlil_var_private.h>
55 #include <net/if_arp.h>
56 #include <net/iptap.h>
57 #include <net/pktap.h>
58 #include <net/droptap.h>
59 #include <net/nwk_wq.h>
60 #include <sys/kern_event.h>
61 #include <sys/kdebug.h>
62 #include <sys/mcache.h>
63 #include <sys/syslog.h>
64 #include <sys/protosw.h>
65 #include <sys/priv.h>
66
67 #include <kern/assert.h>
68 #include <kern/task.h>
69 #include <kern/thread.h>
70 #include <kern/sched_prim.h>
71 #include <kern/locks.h>
72 #include <kern/zalloc.h>
73
74 #include <net/kpi_protocol.h>
75 #include <net/if_types.h>
76 #include <net/if_ipsec.h>
77 #include <net/if_llreach.h>
78 #include <net/if_utun.h>
79 #include <net/kpi_interfacefilter.h>
80 #include <net/classq/classq.h>
81 #include <net/classq/classq_sfb.h>
82 #include <net/flowhash.h>
83 #include <net/ntstat.h>
84 #if SKYWALK
85 #include <skywalk/lib/net_filter_event.h>
86 #endif /* SKYWALK */
87 #include <net/net_api_stats.h>
88 #include <net/if_ports_used.h>
89 #include <net/if_vlan_var.h>
90 #include <netinet/in.h>
91 #if INET
92 #include <netinet/in_var.h>
93 #include <netinet/igmp_var.h>
94 #include <netinet/ip_var.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_var.h>
97 #include <netinet/udp.h>
98 #include <netinet/udp_var.h>
99 #include <netinet/if_ether.h>
100 #include <netinet/in_pcb.h>
101 #include <netinet/in_tclass.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip_icmp.h>
104 #include <netinet/icmp_var.h>
105 #endif /* INET */
106
107 #include <net/nat464_utils.h>
108 #include <netinet6/in6_var.h>
109 #include <netinet6/nd6.h>
110 #include <netinet6/mld6_var.h>
111 #include <netinet6/scope6_var.h>
112 #include <netinet/ip6.h>
113 #include <netinet/icmp6.h>
114 #include <net/pf_pbuf.h>
115 #include <libkern/OSAtomic.h>
116 #include <libkern/tree.h>
117
118 #include <dev/random/randomdev.h>
119 #include <machine/machine_routines.h>
120
121 #include <mach/thread_act.h>
122 #include <mach/sdt.h>
123
124 #if CONFIG_MACF
125 #include <sys/kauth.h>
126 #include <security/mac_framework.h>
127 #include <net/ethernet.h>
128 #include <net/firewire.h>
129 #endif
130
131 #if PF
132 #include <net/pfvar.h>
133 #endif /* PF */
134 #include <net/pktsched/pktsched.h>
135 #include <net/pktsched/pktsched_netem.h>
136
137 #if NECP
138 #include <net/necp.h>
139 #endif /* NECP */
140
141 #if SKYWALK
142 #include <skywalk/packet/packet_queue.h>
143 #include <skywalk/nexus/netif/nx_netif.h>
144 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
145 #endif /* SKYWALK */
146
147 #include <net/sockaddr_utils.h>
148
149 #include <os/log.h>
150
151 #define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0)
152 #define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2)
153 #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
154 #define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
155 #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
156
157 #define IF_DATA_REQUIRE_ALIGNED_64(f) \
158 _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
159
160 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \
161 _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
162
163 enum {
164 kProtoKPI_v1 = 1,
165 kProtoKPI_v2 = 2
166 };
167
168 uint64_t if_creation_generation_count = 0;
169
170 /*
171 * List of if_proto structures in if_proto_hash[] is protected by
172 * the ifnet lock. The rest of the fields are initialized at protocol
173 * attach time and never change, thus no lock required as long as
174 * a reference to it is valid, via if_proto_ref().
175 */
struct if_proto {
	SLIST_ENTRY(if_proto) next_hash;	/* if_proto_hash[] linkage (ifnet lock) */
	u_int32_t refcount;			/* outstanding references; see if_proto_ref() */
	u_int32_t detached;			/* nonzero once detached from the ifnet */
	struct ifnet *ifp;			/* interface this protocol is attached to */
	protocol_family_t protocol_family;	/* attached protocol family */
	int proto_kpi;				/* KPI version: kProtoKPI_v1 or kProtoKPI_v2 */
	/* Callback set; which arm is valid is selected by proto_kpi. */
	union {
		struct {
			proto_media_input input;		/* per-packet input */
			proto_media_preout pre_output;		/* pre-output hook */
			proto_media_event event;		/* interface event hook */
			proto_media_ioctl ioctl;		/* ioctl hook */
			proto_media_detached detached;		/* detach notification */
			proto_media_resolve_multi resolve_multi; /* multicast resolver */
			proto_media_send_arp send_arp;		/* ARP transmit hook */
		} v1;
		struct {
			proto_media_input_v2 input;		/* packet-chain input */
			proto_media_preout pre_output;		/* pre-output hook */
			proto_media_event event;		/* interface event hook */
			proto_media_ioctl ioctl;		/* ioctl hook */
			proto_media_detached detached;		/* detach notification */
			proto_media_resolve_multi resolve_multi; /* multicast resolver */
			proto_media_send_arp send_arp;		/* ARP transmit hook */
		} v2;
	} kpi;
};
204
205 SLIST_HEAD(proto_hash_entry, if_proto);
206
207 #define DLIL_SDLDATALEN \
208 (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
209
210 /*
211 * In the common case, the LL address is stored in the
212 * `dl_if_lladdr' member of the `dlil_ifnet'. This is sufficient
213 * for LL addresses that do not exceed the `DLIL_SDLMAXLEN' constant.
214 */
215 struct dl_if_lladdr_std {
216 struct ifaddr ifa;
217 u_int8_t addr_sdl_bytes[DLIL_SDLMAXLEN];
218 u_int8_t mask_sdl_bytes[DLIL_SDLMAXLEN];
219 };
220
221 /*
222 * However, in some rare cases we encounter LL addresses which
223 * would not fit in the `DLIL_SDLMAXLEN' limitation. In such cases
224 * we allocate the storage in the permanent arena, using this memory layout.
225 */
226 struct dl_if_lladdr_xtra_space {
227 struct ifaddr ifa;
228 u_int8_t addr_sdl_bytes[SOCK_MAXADDRLEN];
229 u_int8_t mask_sdl_bytes[SOCK_MAXADDRLEN];
230 };
231
/*
 * DLIL's private per-interface object: embeds the public ifnet as its
 * first member (see DLIL_TO_IFP/IFP_TO_DLIL) plus bookkeeping that is
 * private to DLIL.  Instances are linked on dlil_ifnet_head and may be
 * recycled (see the DLIF_INUSE/DLIF_REUSE flags below).
 */
struct dlil_ifnet {
	struct ifnet dl_if; /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */
	u_int32_t dl_if_flags;  /* flags (below) */
	u_int32_t dl_if_refcnt; /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void    *dl_if_uniqueid;        /* unique interface id */
	size_t dl_if_uniqueid_len;      /* length of the unique id */
	char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */
	char dl_if_xnamestorage[IFXNAMSIZ]; /* external name storage */
	struct dl_if_lladdr_std dl_if_lladdr; /* link-level address storage*/
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set; /* nonzero once the above is valid */
	u_int8_t dl_if_unused;
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t dl_if_attach;          /* attach PC stacktrace */
	ctrace_t dl_if_detach;          /* detach PC stacktrace */
};
255
256 /* Values for dl_if_flags (private to DLIL) */
257 #define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */
258 #define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */
259 #define DLIF_DEBUG 0x4 /* has debugging info */
260
261 #define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */
262
263 /* For gdb */
264 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
265
266 struct dlil_ifnet_dbg {
267 struct dlil_ifnet dldbg_dlif; /* dlil_ifnet */
268 u_int16_t dldbg_if_refhold_cnt; /* # ifnet references */
269 u_int16_t dldbg_if_refrele_cnt; /* # ifnet releases */
270 /*
271 * Circular lists of ifnet_{reference,release} callers.
272 */
273 ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
274 ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
275 };
276
277 #define DLIL_TO_IFP(s) (&s->dl_if)
278 #define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s)
279
/*
 * One attached interface filter (see iflt_attach()/interface filter KPI).
 * Filters are kept on a per-interface TAILQ and get a chance to inspect
 * or modify traffic via the callbacks below.
 */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter) filt_next;	/* per-ifnet filter list linkage */
	u_int32_t filt_skip;			/* when set, filter is skipped */
	u_int32_t filt_flags;			/* filter flags */
	ifnet_t filt_ifp;			/* interface the filter is attached to */
	const char *filt_name;			/* client-supplied filter name */
	void *filt_cookie;			/* opaque client context */
	protocol_family_t filt_protocol;	/* protocol restriction (0 presumably = all) */
	iff_input_func filt_input;		/* inbound packet callback */
	iff_output_func filt_output;		/* outbound packet callback */
	iff_event_func filt_event;		/* interface event callback */
	iff_ioctl_func filt_ioctl;		/* ioctl callback */
	iff_detached_func filt_detached;	/* detach completion callback */
};
294
295 /* Mbuf queue used for freeing the excessive mbufs */
296 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
297
298 struct proto_input_entry;
299
300 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
301
302 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
303
304 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
305 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
306 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
307 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
308 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
309
310 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
311 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
312 &dlil_lck_attributes);
313 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
314 &dlil_lck_attributes);
315
316 #if DEBUG
317 static unsigned int ifnet_debug = 1; /* debugging (enabled) */
318 #else
319 static unsigned int ifnet_debug; /* debugging (disabled) */
320 #endif /* !DEBUG */
321 static unsigned int dlif_size; /* size of dlil_ifnet to allocate */
322 static unsigned int dlif_bufsize; /* size of dlif_size + headroom */
323 static struct zone *dlif_zone; /* zone for dlil_ifnet */
324 #define DLIF_ZONE_NAME "ifnet" /* zone name */
325
326 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
327
328 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
329
330 static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */
331 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
332 static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */
333 #define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */
334
335 static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */
336 static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */
337 static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */
338 #define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */
339
340 static u_int32_t net_rtref;
341
342 static struct dlil_main_threading_info dlil_main_input_thread_info;
343 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
344 (struct dlil_threading_info *)&dlil_main_input_thread_info;
345
346 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
347 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
348 static void dlil_if_trace(struct dlil_ifnet *, int);
349 static void if_proto_ref(struct if_proto *);
350 static void if_proto_free(struct if_proto *);
351 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
352 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
353 u_int32_t list_count);
354 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
355 static void if_flt_monitor_busy(struct ifnet *);
356 static void if_flt_monitor_unbusy(struct ifnet *);
357 static void if_flt_monitor_enter(struct ifnet *);
358 static void if_flt_monitor_leave(struct ifnet *);
359 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
360 char **, protocol_family_t, boolean_t);
361 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
362 protocol_family_t);
363 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
364 const struct sockaddr_dl *);
365 static int ifnet_lookup(struct ifnet *);
366 static void if_purgeaddrs(struct ifnet *);
367
368 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
369 struct mbuf *, char *);
370 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
371 struct mbuf *);
372 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
373 mbuf_t *, const struct sockaddr *, void *, char *, char *);
374 static void ifproto_media_event(struct ifnet *, protocol_family_t,
375 const struct kev_msg *);
376 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
377 unsigned long, void *);
378 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
379 struct sockaddr_dl *, size_t);
380 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
381 const struct sockaddr_dl *, const struct sockaddr *,
382 const struct sockaddr_dl *, const struct sockaddr *);
383
384 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
385 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
386 boolean_t poll, struct thread *tp);
387 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
388 struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
389 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
390 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
391 protocol_family_t *);
392 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
393 const struct ifnet_demux_desc *, u_int32_t);
394 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
395 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
396 #if !XNU_TARGET_OS_OSX
397 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
398 const struct sockaddr *, const char *, const char *,
399 u_int32_t *, u_int32_t *);
400 #else /* XNU_TARGET_OS_OSX */
401 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
402 const struct sockaddr *, const char *, const char *);
403 #endif /* XNU_TARGET_OS_OSX */
404 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
405 const struct sockaddr *, const char *, const char *,
406 u_int32_t *, u_int32_t *);
407 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
408 static void ifp_if_free(struct ifnet *);
409 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
410 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
411 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
412
413 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
414 dlil_freeq_t *, struct ifnet_stat_increment_param *);
415
416 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
417 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
418 boolean_t, struct thread *);
419 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
420 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
421 boolean_t, struct thread *);
422
423 static void dlil_main_input_thread_func(void *, wait_result_t);
424 static void dlil_main_input_thread_cont(void *, wait_result_t);
425
426 static void dlil_input_thread_func(void *, wait_result_t);
427 static void dlil_input_thread_cont(void *, wait_result_t);
428
429 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
430 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
431
432 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
433 thread_continue_t *);
434 static void dlil_terminate_input_thread(struct dlil_threading_info *);
435 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
436 struct dlil_threading_info *, struct ifnet *, boolean_t);
437 static boolean_t dlil_input_stats_sync(struct ifnet *,
438 struct dlil_threading_info *);
439 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
440 u_int32_t, ifnet_model_t, boolean_t);
441 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
442 const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
443 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
444 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
445 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
446 #if DEBUG || DEVELOPMENT
447 static void dlil_verify_sum16(void);
448 #endif /* DEBUG || DEVELOPMENT */
449 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
450 protocol_family_t);
451 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
452 protocol_family_t);
453
454 static void dlil_incr_pending_thread_count(void);
455 static void dlil_decr_pending_thread_count(void);
456
457 static void ifnet_detacher_thread_func(void *, wait_result_t);
458 static void ifnet_detacher_thread_cont(void *, wait_result_t);
459 static void ifnet_detach_final(struct ifnet *);
460 static void ifnet_detaching_enqueue(struct ifnet *);
461 static struct ifnet *ifnet_detaching_dequeue(void);
462
463 static void ifnet_start_thread_func(void *, wait_result_t);
464 static void ifnet_start_thread_cont(void *, wait_result_t);
465
466 static void ifnet_poll_thread_func(void *, wait_result_t);
467 static void ifnet_poll_thread_cont(void *, wait_result_t);
468
469 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
470 classq_pkt_t *, boolean_t, boolean_t *);
471
472 static void ifp_src_route_copyout(struct ifnet *, struct route *);
473 static void ifp_src_route_copyin(struct ifnet *, struct route *);
474 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
475 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
476
477 static errno_t if_mcasts_update_async(struct ifnet *);
478
479 /* The following are protected by dlil_ifnet_lock */
480 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
481 static u_int32_t ifnet_detaching_cnt;
482 static boolean_t ifnet_detaching_embryonic;
483 static void *ifnet_delayed_run; /* wait channel for detaching thread */
484
485 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
486 &dlil_lck_attributes);
487
488 static uint32_t ifnet_flowhash_seed;
489
/*
 * Input key for computing an interface's flow hash (presumably consumed
 * by ifnet_calc_flowhash() together with ifnet_flowhash_seed — confirm
 * against that function's definition).
 */
struct ifnet_flowhash_key {
	char            ifk_name[IFNAMSIZ];	/* interface name */
	uint32_t        ifk_unit;		/* interface unit number */
	uint32_t        ifk_flags;		/* interface flags */
	uint32_t        ifk_eflags;		/* extended flags */
	uint32_t        ifk_capabilities;	/* capabilities supported */
	uint32_t        ifk_capenable;		/* capabilities enabled */
	uint32_t        ifk_output_sched_model;	/* output scheduling model */
	uint32_t        ifk_rand1;		/* random salt */
	uint32_t        ifk_rand2;		/* random salt */
};
501
/* Flow control entry per interface; node in ifnet_fc_tree (ifnet_fc_lock) */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;	/* ifnet_fc_tree linkage */
	u_int32_t       ifce_flowhash;		/* interface flow hash */
	struct ifnet    *ifce_ifp;		/* back-pointer to the interface */
};
508
509 static uint32_t ifnet_calc_flowhash(struct ifnet *);
510 static int ifce_cmp(const struct ifnet_fc_entry *,
511 const struct ifnet_fc_entry *);
512 static int ifnet_fc_add(struct ifnet *);
513 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
514 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
515
516 /* protected by ifnet_fc_lock */
517 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
518 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
519 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
520
521 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
522
523 extern void bpfdetach(struct ifnet *);
524 extern void proto_input_run(void);
525
526 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
527 u_int32_t flags);
528 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
529 u_int32_t flags);
530
531 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
532
533 #if CONFIG_MACF
534 #if !XNU_TARGET_OS_OSX
535 int dlil_lladdr_ckreq = 1;
536 #else /* XNU_TARGET_OS_OSX */
537 int dlil_lladdr_ckreq = 0;
538 #endif /* XNU_TARGET_OS_OSX */
539 #endif /* CONFIG_MACF */
540
541 /* rate limit debug messages */
542 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
543
/*
 * Atomically increment the ifnet_delay_start_disabled counter (declared
 * elsewhere); callers presumably use it to suppress delayed if_start —
 * confirm against the counter's consumers.
 */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
549
550 static void log_hexdump(void *data, size_t len);
551
552 unsigned int net_rxpoll = 1;
553 unsigned int net_affinity = 1;
554 unsigned int net_async = 1; /* 0: synchronous, 1: asynchronous */
555
556 static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);
557
558 extern u_int32_t inject_buckets;
559
560 /* DLIL data threshold thread call */
561 static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
562
563 void
ifnet_filter_update_tso(struct ifnet * ifp,boolean_t filter_enable)564 ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
565 {
566 /*
567 * update filter count and route_generation ID to let TCP
568 * know it should reevalute doing TSO or not
569 */
570 if (filter_enable) {
571 OSAddAtomic(1, &ifp->if_flt_no_tso_count);
572 } else {
573 VERIFY(ifp->if_flt_no_tso_count != 0);
574 OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
575 }
576 routegenid_update();
577 }
578
579 #if SKYWALK
580
581 static bool net_check_compatible_if_filter(struct ifnet *ifp);
582
583 /* if_attach_nx flags defined in os_skywalk_private.h */
584 unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
585 unsigned int if_enable_fsw_ip_netagent =
586 ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
587 unsigned int if_enable_fsw_transport_netagent =
588 ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
589
590 unsigned int if_netif_all =
591 ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);
592
593 /* Configure flowswitch to use max mtu sized buffer */
594 static bool fsw_use_max_mtu_buffer = false;
595
596
597 static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);
598
599 #include <skywalk/os_skywalk_private.h>
600
601 boolean_t
ifnet_nx_noauto(ifnet_t ifp)602 ifnet_nx_noauto(ifnet_t ifp)
603 {
604 return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
605 }
606
/*
 * TRUE when automatic flowswitch attachment should be skipped;
 * currently this is exactly the low-latency interface check.
 */
boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}
612
613 boolean_t
ifnet_is_low_latency(ifnet_t ifp)614 ifnet_is_low_latency(ifnet_t ifp)
615 {
616 return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
617 }
618
619 boolean_t
ifnet_needs_compat(ifnet_t ifp)620 ifnet_needs_compat(ifnet_t ifp)
621 {
622 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
623 return FALSE;
624 }
625 #if !XNU_TARGET_OS_OSX
626 /*
627 * To conserve memory, we plumb in the compat layer selectively; this
628 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
629 * In particular, we check for Wi-Fi Access Point.
630 */
631 if (IFNET_IS_WIFI(ifp)) {
632 /* Wi-Fi Access Point */
633 if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
634 ifp->if_name[2] == '\0') {
635 return if_netif_all;
636 }
637 }
638 #else /* XNU_TARGET_OS_OSX */
639 #pragma unused(ifp)
640 #endif /* XNU_TARGET_OS_OSX */
641 return TRUE;
642 }
643
644 boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)645 ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
646 {
647 if (if_is_fsw_transport_netagent_enabled()) {
648 /* check if netagent has been manually enabled for ipsec/utun */
649 if (ifp->if_family == IFNET_FAMILY_IPSEC) {
650 return ipsec_interface_needs_netagent(ifp);
651 } else if (ifp->if_family == IFNET_FAMILY_UTUN) {
652 return utun_interface_needs_netagent(ifp);
653 }
654
655 /* check ifnet no auto nexus override */
656 if (ifnet_nx_noauto(ifp)) {
657 return FALSE;
658 }
659
660 /* check global if_attach_nx configuration */
661 switch (ifp->if_family) {
662 case IFNET_FAMILY_CELLULAR:
663 case IFNET_FAMILY_ETHERNET:
664 if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
665 return TRUE;
666 }
667 break;
668 default:
669 break;
670 }
671 }
672 return FALSE;
673 }
674
675 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)676 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
677 {
678 #pragma unused(ifp)
679 if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
680 return TRUE;
681 }
682 return FALSE;
683 }
684
685 boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)686 ifnet_needs_netif_netagent(ifnet_t ifp)
687 {
688 #pragma unused(ifp)
689 return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
690 }
691
692 static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,const char * func_str,uuid_t instance,uuid_t device)693 dlil_detach_nexus_instance(nexus_controller_t controller,
694 const char *func_str, uuid_t instance, uuid_t device)
695 {
696 errno_t err;
697
698 if (instance == NULL || uuid_is_null(instance)) {
699 return FALSE;
700 }
701
702 /* followed by the device port */
703 if (device != NULL && !uuid_is_null(device)) {
704 err = kern_nexus_ifdetach(controller, instance, device);
705 if (err != 0) {
706 DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
707 func_str, err);
708 }
709 }
710 err = kern_nexus_controller_free_provider_instance(controller,
711 instance);
712 if (err != 0) {
713 DLIL_PRINTF("%s free_provider_instance failed %d\n",
714 func_str, err);
715 }
716 return TRUE;
717 }
718
719 static boolean_t
dlil_detach_nexus(const char * func_str,uuid_t provider,uuid_t instance,uuid_t device)720 dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
721 uuid_t device)
722 {
723 boolean_t detached = FALSE;
724 nexus_controller_t controller = kern_nexus_shared_controller();
725 int err;
726
727 if (dlil_detach_nexus_instance(controller, func_str, instance,
728 device)) {
729 detached = TRUE;
730 }
731 if (provider != NULL && !uuid_is_null(provider)) {
732 detached = TRUE;
733 err = kern_nexus_controller_deregister_provider(controller,
734 provider);
735 if (err != 0) {
736 DLIL_PRINTF("%s deregister_provider %d\n",
737 func_str, err);
738 }
739 }
740 return detached;
741 }
742
743 static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,nexus_type_t type,ifnet_t ifp,uuid_t * provider,uuid_t * instance,nexus_attr_t attr)744 dlil_create_provider_and_instance(nexus_controller_t controller,
745 nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
746 nexus_attr_t attr)
747 {
748 uuid_t dom_prov;
749 errno_t err;
750 nexus_name_t provider_name;
751 const char *type_name =
752 (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
753 struct kern_nexus_init init;
754
755 err = kern_nexus_get_default_domain_provider(type, &dom_prov);
756 if (err != 0) {
757 DLIL_PRINTF("%s can't get %s provider, error %d\n",
758 __func__, type_name, err);
759 goto failed;
760 }
761
762 snprintf((char *)provider_name, sizeof(provider_name),
763 "com.apple.%s.%s", type_name, if_name(ifp));
764 err = kern_nexus_controller_register_provider(controller,
765 dom_prov,
766 provider_name,
767 NULL,
768 0,
769 attr,
770 provider);
771 if (err != 0) {
772 DLIL_PRINTF("%s register %s provider failed, error %d\n",
773 __func__, type_name, err);
774 goto failed;
775 }
776 bzero(&init, sizeof(init));
777 init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
778 err = kern_nexus_controller_alloc_provider_instance(controller,
779 *provider,
780 NULL, NULL,
781 instance, &init);
782 if (err != 0) {
783 DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
784 __func__, type_name, err);
785 kern_nexus_controller_deregister_provider(controller,
786 *provider);
787 goto failed;
788 }
789 failed:
790 return err;
791 }
792
793 static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp,if_nexus_netif_t netif_nx)794 dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
795 {
796 nexus_attr_t attr = NULL;
797 nexus_controller_t controller;
798 errno_t err;
799
800 if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
801 /* it's already attached */
802 if (dlil_verbose) {
803 DLIL_PRINTF("%s: %s already has nexus attached\n",
804 __func__, if_name(ifp));
805 /* already attached */
806 }
807 goto failed;
808 }
809
810 err = kern_nexus_attr_create(&attr);
811 if (err != 0) {
812 DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
813 if_name(ifp));
814 goto failed;
815 }
816 err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
817 VERIFY(err == 0);
818
819 controller = kern_nexus_shared_controller();
820
821 /* create the netif provider and instance */
822 err = dlil_create_provider_and_instance(controller,
823 NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
824 &netif_nx->if_nif_instance, attr);
825 if (err != 0) {
826 goto failed;
827 }
828 err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
829 ifp, NULL, FALSE, &netif_nx->if_nif_attach);
830 if (err != 0) {
831 DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
832 __func__, err);
833 /* cleanup provider and instance */
834 dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
835 netif_nx->if_nif_instance, NULL);
836 goto failed;
837 }
838 return TRUE;
839
840 failed:
841 if (attr != NULL) {
842 kern_nexus_attr_destroy(attr);
843 }
844 return FALSE;
845 }
846
847 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)848 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
849 {
850 if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
851 IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
852 goto failed;
853 }
854 switch (ifp->if_type) {
855 case IFT_CELLULAR:
856 case IFT_ETHER:
857 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
858 /* don't auto-attach */
859 goto failed;
860 }
861 break;
862 default:
863 /* don't auto-attach */
864 goto failed;
865 }
866 return dlil_attach_netif_nexus_common(ifp, netif_nx);
867
868 failed:
869 return FALSE;
870 }
871
872 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)873 dlil_is_native_netif_nexus(ifnet_t ifp)
874 {
875 return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
876 }
877
/*
 * Tear down a netif nexus: provider, instance and device attachment,
 * all via dlil_detach_nexus().
 */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
885
886 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)887 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
888 {
889 struct ifreq ifr;
890 int error;
891
892 bzero(&ifr, sizeof(ifr));
893 error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
894 if (error == 0) {
895 *ifdm_p = ifr.ifr_devmtu;
896 }
897 return error;
898 }
899
900 static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp,uint32_t * large_buf_size)901 _dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
902 {
903 uint32_t tso_v4_mtu = 0;
904 uint32_t tso_v6_mtu = 0;
905
906 if (!kernel_is_macos_or_server()) {
907 return;
908 }
909
910 if (!dlil_is_native_netif_nexus(ifp)) {
911 return;
912 }
913 /*
914 * Note that we are reading the real hwassist flags set by the driver
915 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
916 * hasn't been called yet.
917 */
918 if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
919 tso_v4_mtu = ifp->if_tso_v4_mtu;
920 }
921 if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
922 tso_v6_mtu = ifp->if_tso_v6_mtu;
923 }
924 /*
925 * If the hardware supports TSO, adjust the large buf size to match the
926 * supported TSO MTU size.
927 */
928 if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
929 *large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
930 } else {
931 *large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
932 }
933 *large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
934 }
935
936 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)937 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
938 bool *use_multi_buflet, uint32_t *large_buf_size)
939 {
940 struct kern_pbufpool_memory_info rx_pp_info;
941 struct kern_pbufpool_memory_info tx_pp_info;
942 uint32_t if_max_mtu = 0;
943 uint32_t drv_buf_size;
944 struct ifdevmtu ifdm;
945 int err;
946
947 /*
948 * To perform intra-stack RX aggregation flowswitch needs to use
949 * multi-buflet packet.
950 */
951 *use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
952
953 *large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
954 /*
955 * IP over Thunderbolt interface can deliver the largest IP packet,
956 * but the driver advertises the MAX MTU as only 9K.
957 */
958 if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
959 if_max_mtu = IP_MAXPACKET;
960 goto skip_mtu_ioctl;
961 }
962
963 /* determine max mtu */
964 bzero(&ifdm, sizeof(ifdm));
965 err = dlil_siocgifdevmtu(ifp, &ifdm);
966 if (__improbable(err != 0)) {
967 DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
968 __func__, if_name(ifp));
969 /* use default flowswitch buffer size */
970 if_max_mtu = NX_FSW_BUFSIZE;
971 } else {
972 DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
973 ifdm.ifdm_max, ifdm.ifdm_current);
974 /* rdar://problem/44589731 */
975 if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
976 }
977
978 skip_mtu_ioctl:
979 if (if_max_mtu == 0) {
980 DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
981 __func__, if_name(ifp));
982 return EINVAL;
983 }
984 if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
985 DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
986 "max bufsize(%d)\n", __func__,
987 if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
988 return EINVAL;
989 }
990
991 /*
992 * for skywalk native driver, consult the driver packet pool also.
993 */
994 if (dlil_is_native_netif_nexus(ifp)) {
995 err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
996 &tx_pp_info);
997 if (err != 0) {
998 DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
999 __func__, if_name(ifp));
1000 return ENXIO;
1001 }
1002 drv_buf_size = tx_pp_info.kpm_bufsize *
1003 tx_pp_info.kpm_max_frags;
1004 if (if_max_mtu > drv_buf_size) {
1005 DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1006 "tx %d * %d) can't support max mtu(%d)\n", __func__,
1007 if_name(ifp), rx_pp_info.kpm_bufsize,
1008 rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1009 tx_pp_info.kpm_max_frags, if_max_mtu);
1010 return EINVAL;
1011 }
1012 } else {
1013 drv_buf_size = if_max_mtu;
1014 }
1015
1016 if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1017 _CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1018 *use_multi_buflet = true;
1019 /* default flowswitch buffer size */
1020 *buf_size = NX_FSW_BUFSIZE;
1021 *large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1022 } else {
1023 *buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1024 }
1025 _dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1026 ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1027 if (*buf_size >= *large_buf_size) {
1028 *large_buf_size = 0;
1029 }
1030 return 0;
1031 }
1032
/*
 * Create a flowswitch nexus provider/instance and attach its device
 * port to the interface's netif, filling in *nexus_fsw on success.
 * Returns TRUE when attached; FALSE when attachment is not applicable
 * for this interface or failed (both cases are logged).
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err = 0;
	uuid_t netif;
	uint32_t buf_size = 0;
	uint32_t large_buf_size = 0;
	bool multi_buflet;

	/* opted-out and vmnet interfaces are never auto-attached */
	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	/* size the flowswitch buffers from the interface/driver MTU limits */
	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	/* err == 0 here means a policy skip rather than a hard failure */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1131
/*
 * Attach a flowswitch nexus to the interface if one is not already
 * present.  Skips interfaces using the legacy TX model (no
 * IFEF_TXSTART) and, on DEVELOPMENT/DEBUG kernels, interfaces allowed
 * netif-direct access.  Holds an interface I/O refcount for the
 * duration of the attach.  Returns TRUE when a new flowswitch was
 * attached.
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t attached = FALSE;
	if_nexus_flowswitch nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s\n", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model\n",
		    if_name(ifp));
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	/* take an I/O refcount so the ifnet cannot detach underneath us */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s not attached",
		    __func__, ifp->if_xname);
		goto done;
	}
	/* only attach when no flowswitch instance exists yet */
	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance)) {
		attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
		if (attached) {
			/* publish the new state under the ifnet lock */
			ifnet_lock_exclusive(ifp);
			ifp->if_nx_flowswitch = nexus_fsw;
			ifnet_lock_done(ifp);
		}
	}
	ifnet_decr_iorefcnt(ifp);

done:
	return attached;
}
1173
/*
 * Detach the flowswitch nexus (provider, instance, and device
 * attachment) previously set up for the interface.
 */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1181
1182 __attribute__((noinline))
1183 static void
dlil_netif_detach_notify(ifnet_t ifp)1184 dlil_netif_detach_notify(ifnet_t ifp)
1185 {
1186 ifnet_detach_notify_cb_t notify = NULL;
1187 void *arg = NULL;
1188
1189 ifnet_get_detach_notify(ifp, ¬ify, &arg);
1190 if (notify == NULL) {
1191 DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1192 return;
1193 }
1194 (*notify)(arg);
1195 }
1196
/*
 * Quiesce data movement on the interface and tear down any attached
 * flowswitch and netif nexuses.  Data movement is suspended and
 * drained for the duration so no packets traverse the nexuses while
 * they are being detached.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	ifnet_datamov_suspend_and_drain(ifp);
	/* flowswitch UUIDs are expected to be set together or not at all */
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	/* same invariant for the netif nexus */
	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1226
1227 boolean_t
ifnet_add_netagent(ifnet_t ifp)1228 ifnet_add_netagent(ifnet_t ifp)
1229 {
1230 int error;
1231
1232 error = kern_nexus_interface_add_netagent(ifp);
1233 os_log(OS_LOG_DEFAULT,
1234 "kern_nexus_interface_add_netagent(%s) returned %d",
1235 ifp->if_xname, error);
1236 return error == 0;
1237 }
1238
1239 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1240 ifnet_remove_netagent(ifnet_t ifp)
1241 {
1242 int error;
1243
1244 error = kern_nexus_interface_remove_netagent(ifp);
1245 os_log(OS_LOG_DEFAULT,
1246 "kern_nexus_interface_remove_netagent(%s) returned %d",
1247 ifp->if_xname, error);
1248 return error == 0;
1249 }
1250
1251 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1252 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1253 {
1254 if (!IF_FULLY_ATTACHED(ifp)) {
1255 return FALSE;
1256 }
1257 return dlil_attach_flowswitch_nexus(ifp);
1258 }
1259
/*
 * Detach the interface's flowswitch nexus.  The flowswitch state is
 * snapshotted and cleared under the exclusive ifnet lock, then the
 * actual nexus detach runs on the snapshot without the lock held.
 */
boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)
{
	if_nexus_flowswitch nexus_fsw;

	ifnet_lock_exclusive(ifp);
	nexus_fsw = ifp->if_nx_flowswitch;
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
	ifnet_lock_done(ifp);
	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
	    nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
}
1272
1273 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1274 ifnet_attach_native_flowswitch(ifnet_t ifp)
1275 {
1276 if (!dlil_is_native_netif_nexus(ifp)) {
1277 /* not a native netif */
1278 return;
1279 }
1280 ifnet_attach_flowswitch_nexus(ifp);
1281 }
1282
/*
 * Install (or clear, with cb == NULL) the flowswitch RX callback and
 * its argument.  Blocks until every outstanding user of the current
 * callback (tracked by if_fsw_rx_cb_ref) has released it, so the old
 * callback/arg pair cannot be in use after this returns.  Always
 * returns 0.
 */
int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* wait for all readers of the current callback to drain */
	while (ifp->if_fsw_rx_cb_ref > 0) {
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1298
/*
 * Fetch the flowswitch RX callback and its argument, taking a
 * reference that the caller must drop via
 * ifnet_release_flowswitch_rx_callback().  Returns ENOENT when no
 * callback is installed.
 */
int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * This is for avoiding the unnecessary lock acquire for interfaces
	 * not used by a redirect interface.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* re-check under the lock; the callback may have been cleared */
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1320
/*
 * Drop a reference taken by ifnet_get_flowswitch_rx_callback(); the
 * last release wakes any setter waiting in
 * ifnet_set_flowswitch_rx_callback().
 */
void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (--ifp->if_fsw_rx_cb_ref == 0) {
		wakeup(&ifp->if_fsw_rx_cb_ref);
	}
	lck_mtx_unlock(&ifp->if_delegate_lock);
}
1330
/*
 * Set (or clear, with parent == NULL) the delegate parent of an
 * interface.  Blocks until all outstanding users of the current
 * parent (tracked by if_delegate_parent_ref) have released it.
 * Always returns 0.
 */
int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	/* wait for all readers of the current parent to drain */
	while (difp->if_delegate_parent_ref > 0) {
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1345
/*
 * Fetch the delegate parent of an interface, taking a reference that
 * the caller must drop via ifnet_release_delegate_parent().  Returns
 * ENOENT when no delegate parent is set.
 */
int
ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (difp->if_delegate_parent == NULL) {
		lck_mtx_unlock(&difp->if_delegate_lock);
		return ENOENT;
	}
	*parentp = difp->if_delegate_parent;
	difp->if_delegate_parent_ref++;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1359
/*
 * Drop a reference taken by ifnet_get_delegate_parent(); the last
 * release wakes any setter waiting in ifnet_set_delegate_parent().
 */
void
ifnet_release_delegate_parent(ifnet_t difp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (--difp->if_delegate_parent_ref == 0) {
		wakeup(&difp->if_delegate_parent_ref);
	}
	lck_mtx_unlock(&difp->if_delegate_lock);
}
1369
/*
 * Set the detach-notify callback and argument; the caller must hold
 * the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}
1378
/*
 * Read the detach-notify callback and argument; the caller must hold
 * the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	*notifyp = ifp->if_detach_notify;
	*argp = ifp->if_detach_notify_arg;
}
1387
/*
 * Locking wrapper for ifnet_set_detach_notify_locked(): takes the
 * ifnet lock exclusively around the update.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_exclusive(ifp);
	ifnet_set_detach_notify_locked(ifp, notify, arg);
	ifnet_lock_done(ifp);
}
1396
/*
 * Locking wrapper for ifnet_get_detach_notify_locked(): takes the
 * ifnet lock exclusively around the read.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_exclusive(ifp);
	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
	ifnet_lock_done(ifp);
}
1405 #endif /* SKYWALK */
1406
/*
 * Sanity-check an inbound mbuf: it must carry a pkthdr and its rcvif
 * must match the receiving interface (loopback excepted).  Panics on
 * violation.
 */
#define DLIL_INPUT_CHECK(m, ifp) { \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m); \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) || \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) { \
		panic_plain("%s: invalid mbuf %p\n", __func__, m); \
		/* NOTREACHED */ \
	} \
}

/*
 * Exponentially-weighted moving average:
 *   avg = ((avg << decay) - avg + new) >> decay
 * i.e. the old value is weighted (2^decay - 1)/2^decay.  A zero
 * average is seeded directly with the new sample.
 */
#define DLIL_EWMA(old, new, decay) do { \
	u_int32_t _avg; \
	if ((_avg = (old)) > 0) \
		_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else \
		_avg = (new); \
	(old) = _avg; \
} while (0)

#define MBPS (1ULL * 1000 * 1000)
#define GBPS (MBPS * 1000)

/* Per-link-speed RX polling watermarks. */
struct rxpoll_time_tbl {
	u_int64_t speed; /* downlink speed */
	u_int32_t plowat; /* packets low watermark */
	u_int32_t phiwat; /* packets high watermark */
	u_int32_t blowat; /* bytes low watermark */
	u_int32_t bhiwat; /* bytes high watermark */
};

/* RX poll watermarks by link speed; the zero-speed row terminates the table */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};

/* Tracks DLIL threads that have been spawned but not finished startup. */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;
1448
/*
 * Account a newly-spawned DLIL thread that has not yet completed its
 * startup; dlil_decr_pending_thread_count() wakes waiters when the
 * count drops back to zero.
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1457
/*
 * Mark one pending DLIL thread as fully started; wakes anyone
 * sleeping on dlil_pending_thread_cnt when the last one completes.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1470
1471 int
proto_hash_value(u_int32_t protocol_family)1472 proto_hash_value(u_int32_t protocol_family)
1473 {
1474 /*
1475 * dlil_proto_unplumb_all() depends on the mapping between
1476 * the hash bucket index and the protocol family defined
1477 * here; future changes must be applied there as well.
1478 */
1479 switch (protocol_family) {
1480 case PF_INET:
1481 return 0;
1482 case PF_INET6:
1483 return 1;
1484 case PF_VLAN:
1485 return 2;
1486 case PF_UNSPEC:
1487 default:
1488 return 3;
1489 }
1490 }
1491
1492 /*
1493 * Caller must already be holding ifnet lock.
1494 */
1495 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1496 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1497 {
1498 struct if_proto *proto = NULL;
1499 u_int32_t i = proto_hash_value(protocol_family);
1500
1501 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1502
1503 if (ifp->if_proto_hash != NULL) {
1504 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1505 }
1506
1507 while (proto != NULL && proto->protocol_family != protocol_family) {
1508 proto = SLIST_NEXT(proto, next_hash);
1509 }
1510
1511 if (proto != NULL) {
1512 if_proto_ref(proto);
1513 }
1514
1515 return proto;
1516 }
1517
/* Take an additional reference on an attached protocol entry. */
static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}
1523
1524 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1525
/*
 * Drop a reference on an attached protocol entry.  When the last
 * reference is released: invoke the protocol's detached callback,
 * clean up its routes, post KEV_DL_PROTO_DETACHED (carrying the count
 * of protocols still attached), mark the interface down when no
 * protocols remain, and free the entry.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		/* not the last reference; nothing more to do */
		return;
	}

	/* notify the protocol (per its KPI version) that it is detached */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1587
1588 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1589 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1590 {
1591 #if !MACH_ASSERT
1592 #pragma unused(ifp)
1593 #endif
1594 unsigned int type = 0;
1595 int ass = 1;
1596
1597 switch (what) {
1598 case IFNET_LCK_ASSERT_EXCLUSIVE:
1599 type = LCK_RW_ASSERT_EXCLUSIVE;
1600 break;
1601
1602 case IFNET_LCK_ASSERT_SHARED:
1603 type = LCK_RW_ASSERT_SHARED;
1604 break;
1605
1606 case IFNET_LCK_ASSERT_OWNED:
1607 type = LCK_RW_ASSERT_HELD;
1608 break;
1609
1610 case IFNET_LCK_ASSERT_NOTOWNED:
1611 /* nothing to do here for RW lock; bypass assert */
1612 ass = 0;
1613 break;
1614
1615 default:
1616 panic("bad ifnet assert type: %d", what);
1617 /* NOTREACHED */
1618 }
1619 if (ass) {
1620 LCK_RW_ASSERT(&ifp->if_lock, type);
1621 }
1622 }
1623
/* Take the interface's RW lock for shared (read) access. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Take the interface's RW lock for exclusive (write) access. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release the interface's RW lock (held shared or exclusive). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1641
#if INET
/* Take the per-interface IPv4 data RW lock for shared access. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Take the per-interface IPv4 data RW lock for exclusive access. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the per-interface IPv4 data RW lock. */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif
1661
/* Take the per-interface IPv6 data RW lock for shared access. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Take the per-interface IPv6 data RW lock for exclusive access. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the per-interface IPv6 data RW lock. */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1679
/* Take the global interface list RW lock for shared access. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Take the global interface list RW lock for exclusive access. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface list RW lock. */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert the global interface list RW lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1703
1704 /*
1705 * dlil_ifp_protolist
1706 * - get the list of protocols attached to the interface, or just the number
1707 * of attached protocols
1708 * - if the number returned is greater than 'list_count', truncation occurred
1709 *
1710 * Note:
1711 * - caller must already be holding ifnet lock.
1712 */
1713 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1714 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1715 u_int32_t list_count)
1716 {
1717 u_int32_t count = 0;
1718 int i;
1719
1720 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1721
1722 if (ifp->if_proto_hash == NULL) {
1723 goto done;
1724 }
1725
1726 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1727 struct if_proto *proto;
1728 SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1729 if (list != NULL && count < list_count) {
1730 list[count] = proto->protocol_family;
1731 }
1732 count++;
1733 }
1734 }
1735 done:
1736 return count;
1737 }
1738
/*
 * Locking wrapper for dlil_ifp_protolist(): returns the number of
 * protocols attached to the interface, copying up to 'count' family
 * identifiers into 'protolist' when it is non-NULL.
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
1747
/* Free a protocol list buffer handed out to if_get_protolist() callers. */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1753
/*
 * Build and post a KEV_NETWORK_CLASS kernel event for the interface.
 * event_data may be NULL, in which case a bare net_event_data carrying
 * only the interface identity is sent.  The interface generation count
 * is normally bumped (which triggers NECP client updates) unless the
 * event is one of the frequent link-quality/state codes or the caller
 * passes suppress_generation.  Returns dlil_event_internal()'s result.
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = event_subclass;
	ev_msg.event_code = event_code;

	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	/* stamp the interface identity into the event payload */
	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
1815
/*
 * Allocate the per-interface protocol statistics structures: the
 * zone-backed TCP/UDP stats plus the IPv4/IPv6 ECN stats.  The zone
 * allocations are over-sized so the published pointer can be aligned
 * to 8 bytes; the original allocation address is stashed one pointer
 * before the aligned base so it can be recovered at free time.
 * Returns 0 on success, EINVAL otherwise (partially-allocated state
 * is released before returning).
 *
 * NOTE(review): if if_tcp_stat/if_udp_stat are already non-NULL on
 * entry, ret stays EINVAL and the cleanup below frees them — this
 * appears to assume a first-time call; confirm against callers.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	/* on failure, release whatever was allocated (or pre-existing) */
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original allocation stashed below the base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
1901
/*
 * Reset all RX polling state for the interface back to defaults:
 * polling mode off, counters and statistics zeroed, mode/sample
 * hold-time and last-time timers cleared.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
1920
/*
 * Create and start the input thread for an interface, or the main DLIL
 * input thread when ifp is NULL (called once from dlil_init).
 *
 * Selects one of four input strategies:
 *   - main input thread:      dlil_main_input_thread_func (ifp == NULL)
 *   - legacy hybrid polling:  dlil_rxpoll_input_thread_func
 *   - asynchronous:           dlil_input_thread_func
 *   - synchronous:            no dedicated thread function; the
 *                             threading state is still initialized but
 *                             ENODEV is returned (see func == NULL below)
 *
 * The chosen continuation is reported to the caller via *thfunc (NULL
 * for the synchronous strategy).  Returns 0 on success, ENODEV for the
 * synchronous strategy; panics if thread creation itself fails.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* hybrid polling requires rxpoll support plus the legacy model */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	/* per-thread lock group/mutex protecting the dlth_* state */
	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	/* cap packets pending on this thread's receive queue */
	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		/* main thread also owns a dedicated queue for lo0 traffic */
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no thread is created */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2062
/*
 * Tear down and scrub a dlil_threading_info so the structure can be
 * reused: destroys the per-thread mutex and lock group, clears flags,
 * name, counters and statistics, and VERIFYs that no packets, affinity
 * state, or worker/poller threads remain.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	/* queue must already be empty; only its limit is reset here */
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2088
/*
 * Terminate the calling input thread.  Runs on the input thread itself
 * (never the main input thread): drains any queued packets, signals
 * DLIL_INPUT_TERMINATE_COMPLETE to the waiter, frees the drained
 * packets, drops the thread reference taken at kernel_thread_start()
 * time, and finally terminates the current thread.  Does not return.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* pull all pending packets off the queue while holding the lock */
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2136
2137 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2138 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2139 {
2140 thread_affinity_policy_data_t policy;
2141
2142 bzero(&policy, sizeof(policy));
2143 policy.affinity_tag = tag;
2144 return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2145 (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2146 }
2147
2148 #if SKYWALK
2149 static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,enum net_filter_event_subsystems state)2150 dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2151 enum net_filter_event_subsystems state)
2152 {
2153 evhlog(debug, "%s: eventhandler saw event type=net_filter_event_state event_code=0x%d",
2154 __func__, state);
2155
2156 bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
2157 if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
2158 if_enable_fsw_transport_netagent = 1;
2159 } else {
2160 if_enable_fsw_transport_netagent = 0;
2161 }
2162 if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
2163 kern_nexus_update_netagents();
2164 } else if (!if_enable_fsw_transport_netagent) {
2165 necp_update_all_clients();
2166 }
2167 }
2168 #endif /* SKYWALK */
2169
/*
 * One-time initialization of the Data Link Interface Layer (DLIL).
 * Verifies layout/constant invariants, parses boot-args, resolves the
 * Skywalk netagent policy, creates the backing zones for ifnet and
 * per-interface TCP/UDP stats, initializes dependent subsystems
 * (addresses, PF, classq, pktsched, flowadv, pktap, droptap, QoS map,
 * low-power and ports-used handlers), and finally starts the main
 * input thread and the interface detacher thread, waiting until both
 * have been scheduled at least once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* boot-arg overrides for DLIL tunables */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled.  This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	/*
	 * NOTE(review): the "force-disabled" branch below triggers when the
	 * device tree did NOT disable the netagent yet it ended up off;
	 * confirm the intended condition (forced vs. default-disabled).
	 */
	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
	if (kernel_is_macos_or_server() && if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	/* global interface lists: all, attached, detaching, ordered */
	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize droptap interface */
	droptap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2485
2486 static void
if_flt_monitor_busy(struct ifnet * ifp)2487 if_flt_monitor_busy(struct ifnet *ifp)
2488 {
2489 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2490
2491 ++ifp->if_flt_busy;
2492 VERIFY(ifp->if_flt_busy != 0);
2493 }
2494
/*
 * Drop a busy reference on ifp's filter list; alias for
 * if_flt_monitor_leave().  Caller must hold if_flt_lock.
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2500
/*
 * Enter the filter monitor for ifp: sleep (releasing if_flt_lock in
 * msleep) until no other thread holds the busy reference, then take
 * it.  Caller must hold if_flt_lock on entry; it is held on return.
 */
static void
if_flt_monitor_enter(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	while (ifp->if_flt_busy) {
		++ifp->if_flt_waiters;
		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
		    (PZERO - 1), "if_flt_monitor", NULL);
	}
	if_flt_monitor_busy(ifp);
}
2513
/*
 * Leave the filter monitor for ifp: drop the busy reference and, when
 * it reaches zero with threads waiting, wake them all up.  Caller must
 * hold if_flt_lock.
 */
static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_flt_busy != 0);
	--ifp->if_flt_busy;

	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
2527
/*
 * Attach an interface filter described by if_filter to ifp.
 *
 * On success, returns 0 and stores the newly allocated filter in
 * *filter_ref; the filter remains attached until detached via
 * dlil_detach_filter().  Returns ENXIO if the interface is not in the
 * global list or is no longer attached.  Also bumps the filter
 * statistics and, for non-TSO filters, the route generation ID so TCP
 * reevaluates TSO use on affected connections.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	/* serialize with other filter list mutators via the monitor */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK
	if (kernel_is_macos_or_server()) {
		net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
		    net_check_compatible_if_filter(NULL));
	}
#endif /* SKYWALK */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* release the I/O refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2620
/*
 * Detach an interface filter.
 *
 * When `detached' is 0, the filter is searched for on every attached
 * interface; if found it is marked skipped, removed from the list
 * under the filter monitor, its counters adjusted, and then destroyed.
 * Returns EINVAL if the filter reference is not found.
 *
 * When `detached' is non-zero, we are called from ifnet_detach_final()
 * after the list has already been emptied; only the counters are
 * adjusted before the filter is destroyed.  In both cases the filter's
 * filt_detached callback (if any) is invoked and the filter is freed.
 */
static int
dlil_detach_filter_internal(interface_filter_t filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance.  Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1; /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				/* reacquire and enter the monitor to remove */
				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevalute doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				/*
				 * When we remove the bridge's interface filter,
				 * clear the field in the ifnet.
				 */
				if ((filter->filt_flags & DLIL_IFF_BRIDGE)
				    != 0) {
					ifp->if_bridge = NULL;
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away.  Make sure to adjust the counters
		 * in this case.  We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevalute doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK
	if (kernel_is_macos_or_server()) {
		net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
		    net_check_compatible_if_filter(NULL));
	}
#endif /* SKYWALK */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2751
2752 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2753 dlil_detach_filter(interface_filter_t filter)
2754 {
2755 if (filter == NULL) {
2756 return;
2757 }
2758 dlil_detach_filter_internal(filter, 0);
2759 }
2760
2761 __private_extern__ boolean_t
dlil_has_ip_filter(void)2762 dlil_has_ip_filter(void)
2763 {
2764 boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
2765
2766 VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
2767
2768 DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2769 return has_filter;
2770 }
2771
2772 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2773 dlil_has_if_filter(struct ifnet *ifp)
2774 {
2775 boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2776 DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2777 return has_filter;
2778 }
2779
/*
 * Signal an input thread that work is pending.  Caller must hold
 * dlth_lock.  Sets DLIL_INPUT_WAITING unconditionally; only issues a
 * wakeup (and counts it in dlth_wtot) when the thread is not already
 * running, since a running thread will notice the flag on its own.
 */
static inline void
dlil_input_wakeup(struct dlil_threading_info *inp)
{
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);

	inp->dlth_flags |= DLIL_INPUT_WAITING;
	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
		inp->dlth_wtot++;
		wakeup_one((caddr_t)&inp->dlth_flags);
	}
}
2791
/*
 * Bootstrap continuation for the main DLIL input thread.  Marks the
 * thread embryonic, immediately wakes it (so it exits the embryonic
 * state on first dispatch), and blocks into the steady-state
 * continuation dlil_main_input_thread_cont.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* assert_wait before wakeup so the self-wakeup is not lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
2814
2815 /*
2816 * Main input thread:
2817 *
2818 * a) handles all inbound packets for lo0
2819 * b) handles all inbound packets for interfaces with no dedicated
2820 * input thread (e.g. anything but Ethernet/PDP or those that support
2821 * opportunistic polling.)
2822 * c) protocol registrations
2823 * d) packet injections
2824 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* same object viewed two ways: inpm adds the lo0 receive queue */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first iteration after thread creation? */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* process the dequeued chains without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		/* first pass after creation: report the thread as started */
		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* break (and block) once no flag other than RUNNING is set */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	/* no pending work: clear RUNNING and block until the next wakeup */
	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
2911
2912 /*
2913 * Input thread for interfaces with legacy input model.
2914 */
/*
 * Entry point for a per-interface legacy-model input thread: names the
 * thread after its interface, leaves the embryonic state, and blocks
 * with dlil_input_thread_cont as the continuation.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	/* this thread always serves a dedicated interface */
	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* a legacy interface with RXPOLL enabled belongs to the rxpoll thread */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* prepare to wait before the self-wakeup, to avoid a lost wakeup */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
2949
/*
 * Continuation body of a per-interface legacy input thread: drain the
 * thread's packet queue, deliver the chain, and either block again or
 * terminate when asked to.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* bail out if interrupted or explicitly asked to terminate */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first iteration after thread creation? */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not be worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		/* process the dequeued chain without holding the lock */
		lck_mtx_unlock(&inp->dlth_lock);

		/* first pass after creation: report the thread as started */
		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* stop looping once only RUNNING and/or TERMINATE remain set */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* block until the next dlil_input_wakeup() */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3053
3054 /*
3055 * Input thread for interfaces with opportunistic polling input model.
3056 */
/*
 * Entry point for a per-interface opportunistic-polling input thread:
 * names the thread after its interface, leaves the embryonic state,
 * and blocks with dlil_rxpoll_input_thread_cont as the continuation.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	/* only legacy interfaces with RXPOLL capability get this thread */
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* prepare to wait before the self-wakeup, to avoid a lost wakeup */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3088
/*
 * Continuation body of a per-interface rxpoll input thread.  In
 * addition to draining and delivering the packet queue, it samples
 * inbound packet/byte/wakeup rates (EWMA) and transitions the
 * interface between IFNET_MODEL_INPUT_POLL_ON and _OFF via the
 * driver's if_input_ctl downcall.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	/* bail out if interrupted or explicitly asked to terminate */
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: skip sampling/mode logic */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the poll interval to its lower bound */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not be worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* keep accumulating until the hold time has elapsed */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				/* rate-limit debug output via dlil_dbgrate */
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			/*
			 * Poll OFF when both packet and byte averages fall to
			 * their low watermarks; poll ON when the packet average
			 * and either the byte or wakeup average reach their
			 * high watermarks.
			 */
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		/* first pass after creation: report the thread as started */
		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode. Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be released below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			/* tell the driver about the new input model */
			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* stop looping once only RUNNING and/or TERMINATE remain set */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		/* block until the next dlil_input_wakeup() */
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3374
3375 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3376 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3377 {
3378 if (p != NULL) {
3379 if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3380 (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3381 return EINVAL;
3382 }
3383 if (p->packets_lowat != 0 && /* hiwat must be non-zero */
3384 p->packets_lowat >= p->packets_hiwat) {
3385 return EINVAL;
3386 }
3387 if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3388 (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3389 return EINVAL;
3390 }
3391 if (p->bytes_lowat != 0 && /* hiwat must be non-zero */
3392 p->bytes_lowat >= p->bytes_hiwat) {
3393 return EINVAL;
3394 }
3395 if (p->interval_time != 0 &&
3396 p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3397 p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3398 }
3399 }
3400 return 0;
3401 }
3402
/*
 * Recompute the rxpoll tunables for an interface.  When the link rate
 * is unknown (zero) and no caller-supplied parameters exist, polling
 * is effectively disabled; otherwise watermarks are either taken from
 * the caller-provided params or auto-tuned from rxpoll_tbl based on
 * the input link rate.
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		/* high watermarks set to UINT32_MAX: never reached */
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* pick the highest table entry not exceeding the link rate */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/* non-default global settings take precedence over caller's */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ? if_rxpoll_max : p->packets_limit);
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	/* convert the nanosecond hold times into timespec form */
	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3472
3473 /*
3474 * Must be called on an attached ifnet (caller is expected to check.)
3475 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3476 */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	/* interface must support polling and have an input thread */
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	/* 'locked' means the caller already holds dlth_lock */
	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the input thread detects a change in link rate. If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values. Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3514
3515 /*
3516 * Must be called on an attached ifnet (caller is expected to check.)
3517 */
3518 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3519 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3520 {
3521 struct dlil_threading_info *inp;
3522
3523 VERIFY(ifp != NULL && p != NULL);
3524 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3525 return ENXIO;
3526 }
3527
3528 bzero(p, sizeof(*p));
3529
3530 lck_mtx_lock(&inp->dlth_lock);
3531 p->packets_limit = ifp->if_rxpoll_plim;
3532 p->packets_lowat = ifp->if_rxpoll_plowat;
3533 p->packets_hiwat = ifp->if_rxpoll_phiwat;
3534 p->bytes_lowat = ifp->if_rxpoll_blowat;
3535 p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3536 p->interval_time = ifp->if_rxpoll_ival;
3537 lck_mtx_unlock(&inp->dlth_lock);
3538
3539 return 0;
3540 }
3541
3542 errno_t
ifnet_input(struct ifnet * ifp,struct mbuf * m_head,const struct ifnet_stat_increment_param * s)3543 ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
3544 const struct ifnet_stat_increment_param *s)
3545 {
3546 return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
3547 }
3548
3549 errno_t
ifnet_input_extended(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3550 ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
3551 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3552 {
3553 return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
3554 }
3555
3556 errno_t
ifnet_input_poll(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3557 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3558 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3559 {
3560 return ifnet_input_common(ifp, m_head, m_tail, s,
3561 (m_head != NULL), TRUE);
3562 }
3563
/*
 * Common implementation behind ifnet_input{,_extended,_poll}().
 *
 * Validates the mbuf chain and caller-supplied stats, computes (or
 * sanity-checks) the packet/byte counts, then hands the chain to the
 * DLIL input function installed on the interface.
 *
 * m_head/m_tail: packet chain; tail is only trusted when 'ext'.
 * s:             per-call stat increments; required when 'ext'.
 * ext:           counts came straight from the driver (extended API).
 * poll:          called from the polling path; empty chain is legal.
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail given: walk the chain, counting as we go */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* recount the chain to validate the driver's stats */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		/* trust the driver-provided tail and counts */
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	/*
	 * The locally computed counts go into _s; note the original s
	 * (which aliases _s only when the caller passed NULL) is what
	 * gets handed to input_func below.
	 */
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	/* input administratively disabled: account for and drop the chain */
	if (ifp->if_xflags & IFXF_DISABLE_INPUT) {
		m_freem_list(m_head);

		os_atomic_add(&ifp->if_data.ifi_ipackets, _s.packets_in, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes, _s.bytes_in, relaxed);

		goto done;
	}

	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

done:
	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3688
3689 #if SKYWALK
/*
 * Atomically install a replacement input handler, but only if the
 * current handler is still the default dlil_input_handler; returns
 * EBUSY when some other handler has already been installed.
 */
errno_t
dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	           ptrauth_nop_cast(void *, &dlil_input_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
3697
/*
 * Restore the default dlil_input_handler, retrying the compare-and-swap
 * until it succeeds against whatever handler is currently installed.
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
		;
	}
}
3707
/*
 * Atomically install a replacement output handler, but only if the
 * current handler is still the default dlil_output_handler; returns
 * EBUSY when some other handler has already been installed.
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
3715
/*
 * Restore the default dlil_output_handler, retrying the compare-and-swap
 * until it succeeds against whatever handler is currently installed.
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
		;
	}
}
3725 #endif /* SKYWALK */
3726
3727 errno_t
dlil_output_handler(struct ifnet * ifp,struct mbuf * m)3728 dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
3729 {
3730 return ifp->if_output(ifp, m);
3731 }
3732
/*
 * Default DLIL input handler: route the packet chain to the
 * interface's dedicated input thread, falling back to the main input
 * thread when the interface has none.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	/* deliver synchronously when the calling thread is marked for it */
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
3753
3754 /*
3755 * Detect whether a queue contains a burst that needs to be trimmed.
3756 */
3757 #define MBUF_QUEUE_IS_OVERCOMMITTED(q) \
3758 __improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) && \
3759 qtype(q) == QP_MBUF)
3760
3761 #define MAX_KNOWN_MBUF_CLASS 8
3762
/*
 * Trim an overcommitted input queue down to if_rcvq_trim_pct percent
 * of its limit, moving the dropped (oldest) packets onto 'freeq' for
 * deferred freeing, and adjusting 'stat_delta' to reflect the drops.
 *
 * Named *_locked: the caller is expected to hold the lock protecting
 * 'input_queue'.  Returns the number of packets dropped.
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;    /* Length in packets. */
	uint64_t overcommitted_qsize;   /* Size in bytes. */
	uint32_t target_qlen;           /* The desired queue length after trimming. */
	uint32_t pkts_to_drop = 0;      /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;      /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		/* defer the actual free: stash the mbuf on the caller's list */
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
3859
/*
 * Asynchronous input strategy: enqueue the packet chain on the input
 * thread's receive queue and wake that thread; the packets are
 * processed later in the input thread's context.
 *
 * inp          - input thread servicing this interface
 * ifp          - receiving interface
 * m_head/m_tail - chain of received packets (m_head may be NULL for a
 *                "no packets" poll notification)
 * s            - packet/byte counts describing the chain
 * poll         - TRUE when invoked from the opportunistic poller
 * tp           - driver/poller thread, used once to join the input
 *                thread's affinity set
 *
 * Always returns 0.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	/* Local copy of the stats; trimming below may shrink the counts. */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set.  We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* drop the spin lock across the affinity call */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack. Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		/* Trim a burst that pushed the queue past its limit. */
		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			/*
			 * NOTE(review): the sync variant terminates this
			 * message with "\n"; cosmetic inconsistency only.
			 */
			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context.  All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4003
/*
 * Synchronous input strategy: enqueue the chain, then immediately
 * drain the input queue and run the packets up the stack in the
 * calling thread's context (no handoff to the input thread).  Used,
 * among others, for threads marked NET_THREAD_SYNC_RX on
 * DEVELOPMENT/DEBUG kernels (see ifnet_input_common at the top of
 * this section).
 *
 * Parameters are identical to dlil_input_async; `tp' is unused.
 * Always returns 0.
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	/* Local copy of the stats; trimming below may shrink the counts. */
	struct ifnet_stat_increment_param s_adj = *s;
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/* Trim a burst that pushed the queue past its limit. */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/* Verify `s' against the actual chain (debug aid). */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* Drain everything queued (possibly more than we just added). */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(ifp, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4113
4114 #if SKYWALK
/*
 * Atomically install `fn' as the interface's output handler.
 *
 * Succeeds only while the current handler is still the saved original
 * (if_save_output); returns EBUSY if some other handler has already
 * been installed.  ptrauth_nop_cast strips/applies pointer
 * authentication so the raw pointer values can be exchanged.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	           ptrauth_nop_cast(void *, ifp->if_save_output),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4122
/*
 * Restore the saved original output handler.  The CAS is retried
 * until it lands; if_output is re-read on every iteration, so the
 * loop terminates once the exchange succeeds against the then-current
 * value.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
		;
	}
}
4132
/*
 * Atomically install `fn' as the interface's start handler; mirrors
 * ifnet_set_output_handler.  Returns EBUSY if a non-default handler
 * is already installed.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	           ptrauth_nop_cast(void *, ifp->if_save_start),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4140
/*
 * Restore the saved original start handler; mirrors
 * ifnet_reset_output_handler (retry the CAS until it succeeds).
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
		;
	}
}
4150 #endif /* SKYWALK */
4151
/*
 * Common helper to kick the transmit starter thread.
 *
 * resetfc      - clear the flow-controlled state before deciding
 *                whether to wake the starter (resume path)
 * ignore_delay - force the next start pass to skip the delayed-start
 *                optimization (sets IFSF_NO_DELAY)
 *
 * No-op unless the interface uses the new transmit model
 * (IFEF_TXSTART).
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
{
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (ignore_delay) {
		ifp->if_start_flags |= IFSF_NO_DELAY;
	}
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	/* record the request even if we elect not to wake the thread */
	ifp->if_start_req++;
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4184
/*
 * Public entry point: request a transmit start pass on `ifp'
 * (honors flow control and the delayed-start optimization).
 */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, FALSE);
}
4190
/*
 * Like ifnet_start(), but forces the starter to bypass the
 * delayed-start (enqueue-multi) optimization for this request.
 */
void
ifnet_start_ignore_delay(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, TRUE);
}
4196
/*
 * Entry point of the per-interface transmit starter thread.
 *
 * Names the thread, optionally binds the lo0 starter to the main
 * input thread's affinity set, then parks in the "embryonic" state
 * until woken; execution resumes in ifnet_start_thread_cont.
 * Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag. This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4262
/*
 * Continuation of the transmit starter thread.
 *
 * Loops invoking the driver's if_start routine until no new requests
 * arrive (if_start_req unchanged) or the interface is flow-controlled
 * or terminating, then re-arms a (possibly timed) wait and blocks
 * back into this continuation.  On termination it clears
 * if_start_thread, wakes any waiter, and self-terminates.
 * Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_cont(void *v, wait_result_t wres)
{
	struct ifnet *ifp = v;
	struct ifclassq *ifq = ifp->if_snd;

	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
		goto terminate;
	}

	if (__improbable(ifp->if_start_embryonic)) {
		/* first wakeup: leave embryonic state, don't run the loop */
		ifp->if_start_embryonic = 0;
		lck_mtx_unlock(&ifp->if_start_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_start_lock);
		goto skip;
	}

	ifp->if_start_active = 1;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		u_int32_t req = ifp->if_start_req;
		/*
		 * Delayed-start: with ENQUEUE_MULTI, defer the start pass
		 * while the queue is still short, to batch dequeues.
		 */
		if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
		    !IFCQ_IS_EMPTY(ifq) &&
		    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
		    ifp->if_start_delayed == 0 &&
		    IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
		    (ifp->if_eflags & IFEF_DELAY_START)) {
			ifp->if_start_delayed = 1;
			ifnet_start_delayed++;
			break;
		}
		ifp->if_start_flags &= ~IFSF_NO_DELAY;
		ifp->if_start_delayed = 0;
		lck_mtx_unlock(&ifp->if_start_lock);

		/*
		 * If no longer attached, don't call start because ifp
		 * is being destroyed; else hold an IO refcnt to
		 * prevent the interface from being detached (will be
		 * released below.)
		 */
		if (!ifnet_datamov_begin(ifp)) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			break;
		}

		/* invoke the driver's start routine */
		((*ifp->if_start)(ifp));

		/*
		 * Release the io ref count taken above.
		 */
		ifnet_datamov_end(ifp);

		lck_mtx_lock_spin(&ifp->if_start_lock);

		/*
		 * If there's no pending request or if the
		 * interface has been disabled, we're done.
		 */
#define _IFSF_DISABLED  (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
		if (req == ifp->if_start_req ||
		    (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
			break;
		}
	}
skip:
	ifp->if_start_req = 0;
	ifp->if_start_active = 0;

#if SKYWALK
	/*
	 * Wakeup any waiters, e.g. any threads waiting to
	 * detach the interface from the flowswitch, etc.
	 */
	if (ifp->if_start_waiters != 0) {
		ifp->if_start_waiters = 0;
		wakeup(&ifp->if_start_waiters);
	}
#endif /* SKYWALK */
	if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec delay_start_ts;
		struct timespec *ts = NULL;

		/*
		 * NOTE(review): ts is always NULL at this point, so this
		 * check is redundant -- possibly a remnant of code that
		 * set ts conditionally above.
		 */
		if (ts == NULL) {
			ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
			    &ifp->if_start_cycle : NULL);
		}

		if (ts == NULL && ifp->if_start_delayed == 1) {
			delay_start_ts.tv_sec = 0;
			delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
			ts = &delay_start_ts;
		}

		/* a zero timespec means "no timed wakeup" */
		if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (__improbable(ts != NULL)) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_start_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_start_lock);
		(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached? */
		ifnet_set_start_cycle(ifp, NULL);

		/* clear if_start_thread to allow termination to continue */
		ASSERT(ifp->if_start_thread != THREAD_NULL);
		ifp->if_start_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_start_thread);
		lck_mtx_unlock(&ifp->if_start_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: starter thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4408
4409 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4410 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4411 {
4412 if (ts == NULL) {
4413 bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4414 } else {
4415 *(&ifp->if_start_cycle) = *ts;
4416 }
4417
4418 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4419 DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4420 if_name(ifp), ts->tv_nsec);
4421 }
4422 }
4423
/*
 * Record a poll request and wake the poller thread if it is not
 * already running.  Caller must hold if_poll_lock.
 */
static inline void
ifnet_poll_wakeup(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);

	ifp->if_poll_req++;
	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
	    ifp->if_poll_thread != THREAD_NULL) {
		wakeup_one((caddr_t)&ifp->if_poll_thread);
	}
}
4435
/*
 * Public entry point: request an input poll pass on `ifp'.
 */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4446
/*
 * Entry point of the per-interface input poller thread (RXPOLL
 * interfaces only).  Names the thread, parks in the embryonic state
 * until woken, then continues in ifnet_poll_thread_cont.
 * Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4475
/*
 * Continuation of the input poller thread.
 *
 * Loops calling the driver's if_input_poll routine and feeding the
 * polled chains into ifnet_input_common() until no new poll requests
 * arrive or the interface is terminating, then re-arms a (possibly
 * timed, per if_poll_cycle) wait and blocks back into this
 * continuation.  On termination it clears if_poll_thread, wakes any
 * waiter, and self-terminates.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp;
	struct ifnet *ifp = v;
	struct ifnet_stat_increment_param s;
	struct timespec start_time;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);

	bzero(&s, sizeof(s));
	net_timerclear(&start_time);

	lck_mtx_lock_spin(&ifp->if_poll_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
		goto terminate;
	}

	inp = ifp->if_inp;
	VERIFY(inp != NULL);

	if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
		/* first wakeup: leave embryonic state, don't run the loop */
		ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
		lck_mtx_unlock(&ifp->if_poll_lock);
		ifnet_decr_pending_thread_count(ifp);
		lck_mtx_lock_spin(&ifp->if_poll_lock);
		goto skip;
	}

	ifp->if_poll_flags |= IF_POLLF_RUNNING;

	/*
	 * Keep on servicing until no more request.
	 */
	for (;;) {
		struct mbuf *m_head, *m_tail;
		u_int32_t m_lim, m_cnt, m_totlen;
		u_int16_t req = ifp->if_poll_req;

		/* per-poll packet budget */
		m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
		    MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
		lck_mtx_unlock(&ifp->if_poll_lock);

		/*
		 * If no longer attached, there's nothing to do;
		 * else hold an IO refcnt to prevent the interface
		 * from being detached (will be released below.)
		 */
		if (!ifnet_is_attached(ifp, 1)) {
			lck_mtx_lock_spin(&ifp->if_poll_lock);
			break;
		}

		if (dlil_verbose > 1) {
			DLIL_PRINTF("%s: polling up to %d pkts, "
			    "pkts avg %d max %d, wreq avg %d, "
			    "bytes avg %d\n",
			    if_name(ifp), m_lim,
			    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
			    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
		}

		/* invoke the driver's input poll routine */
		((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
		&m_cnt, &m_totlen));

		if (m_head != NULL) {
			VERIFY(m_tail != NULL && m_cnt > 0);

			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: polled %d pkts, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), m_cnt,
				    ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
				    ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
			}

			/* stats are required for extended variant */
			s.packets_in = m_cnt;
			s.bytes_in = m_totlen;

			(void) ifnet_input_common(ifp, m_head, m_tail,
			    &s, TRUE, TRUE);
		} else {
			if (dlil_verbose > 1) {
				DLIL_PRINTF("%s: no packets, "
				    "pkts avg %d max %d, wreq avg %d, "
				    "bytes avg %d\n",
				    if_name(ifp), ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_bavg);
			}

			(void) ifnet_input_common(ifp, NULL, NULL,
			    NULL, FALSE, TRUE);
		}

		/* Release the io ref count */
		ifnet_decr_iorefcnt(ifp);

		lck_mtx_lock_spin(&ifp->if_poll_lock);

		/* if there's no pending request, we're done */
		if (req == ifp->if_poll_req ||
		    (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
			break;
		}
	}
skip:
	ifp->if_poll_req = 0;
	ifp->if_poll_flags &= ~IF_POLLF_RUNNING;

	if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
		uint64_t deadline = TIMEOUT_WAIT_FOREVER;
		struct timespec *ts;

		/*
		 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
		 * until ifnet_poll() is called again.
		 */
		ts = &ifp->if_poll_cycle;
		if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
			ts = NULL;
		}

		if (ts != NULL) {
			clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
			    (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
		}

		(void) assert_wait_deadline(&ifp->if_poll_thread,
		    THREAD_UNINT, deadline);
		lck_mtx_unlock(&ifp->if_poll_lock);
		(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
		/* NOTREACHED */
	} else {
terminate:
		/* interface is detached (maybe while asleep)? */
		ifnet_set_poll_cycle(ifp, NULL);

		/* clear if_poll_thread to allow termination to continue */
		ASSERT(ifp->if_poll_thread != THREAD_NULL);
		ifp->if_poll_thread = THREAD_NULL;
		wakeup((caddr_t)&ifp->if_poll_thread);
		lck_mtx_unlock(&ifp->if_poll_lock);

		if (dlil_verbose) {
			DLIL_PRINTF("%s: poller thread terminated\n",
			    if_name(ifp));
		}

		/* for the extra refcnt from kernel_thread_start() */
		thread_deallocate(current_thread());
		/* this is the end */
		thread_terminate(current_thread());
		/* NOTREACHED */
	}

	/* must never get here */
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
4642
4643 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4644 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4645 {
4646 if (ts == NULL) {
4647 bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4648 } else {
4649 *(&ifp->if_poll_cycle) = *ts;
4650 }
4651
4652 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4653 DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4654 if_name(ifp), ts->tv_nsec);
4655 }
4656 }
4657
4658 void
ifnet_purge(struct ifnet * ifp)4659 ifnet_purge(struct ifnet *ifp)
4660 {
4661 if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
4662 if_qflush_snd(ifp, false);
4663 }
4664 }
4665
/*
 * Propagate a classq event to the send queue.
 *
 * If a token-bucket regulator is active, re-apply its current profile
 * so that percentage-based rates are recomputed against the updated
 * link parameters, then forward the event to the classq layer.
 * Caller holds the ifclassq lock; no-op until the queue has been
 * initialized (IFCQ_IS_READY).
 */
void
ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);

	if (!(IFCQ_IS_READY(ifq))) {
		return;
	}

	if (IFCQ_TBR_IS_ENABLED(ifq)) {
		struct tb_profile tb = {
			.rate = ifq->ifcq_tbr.tbr_rate_raw,
			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
		};
		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
	}

	ifclassq_update(ifq, ev);
}
4685
4686 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4687 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4688 {
4689 switch (ev) {
4690 case CLASSQ_EV_LINK_BANDWIDTH:
4691 if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4692 ifp->if_poll_update++;
4693 }
4694 break;
4695
4696 default:
4697 break;
4698 }
4699 }
4700
4701 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4702 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4703 {
4704 struct ifclassq *ifq;
4705 u_int32_t omodel;
4706 errno_t err;
4707
4708 if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4709 return EINVAL;
4710 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4711 return ENXIO;
4712 }
4713
4714 ifq = ifp->if_snd;
4715 IFCQ_LOCK(ifq);
4716 omodel = ifp->if_output_sched_model;
4717 ifp->if_output_sched_model = model;
4718 if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4719 ifp->if_output_sched_model = omodel;
4720 }
4721 IFCQ_UNLOCK(ifq);
4722
4723 return err;
4724 }
4725
4726 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4727 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4728 {
4729 if (ifp == NULL) {
4730 return EINVAL;
4731 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4732 return ENXIO;
4733 }
4734
4735 ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4736
4737 return 0;
4738 }
4739
4740 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4741 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4742 {
4743 if (ifp == NULL || maxqlen == NULL) {
4744 return EINVAL;
4745 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4746 return ENXIO;
4747 }
4748
4749 *maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4750
4751 return 0;
4752 }
4753
4754 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4755 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4756 {
4757 errno_t err;
4758
4759 if (ifp == NULL || pkts == NULL) {
4760 err = EINVAL;
4761 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4762 err = ENXIO;
4763 } else {
4764 err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4765 IF_CLASSQ_ALL_GRPS, pkts, NULL);
4766 }
4767
4768 return err;
4769 }
4770
4771 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4772 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4773 u_int32_t *pkts, u_int32_t *bytes)
4774 {
4775 errno_t err;
4776
4777 if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4778 (pkts == NULL && bytes == NULL)) {
4779 err = EINVAL;
4780 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4781 err = ENXIO;
4782 } else {
4783 err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4784 pkts, bytes);
4785 }
4786
4787 return err;
4788 }
4789
4790 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4791 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4792 {
4793 struct dlil_threading_info *inp;
4794
4795 if (ifp == NULL) {
4796 return EINVAL;
4797 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4798 return ENXIO;
4799 }
4800
4801 if (maxqlen == 0) {
4802 maxqlen = if_rcvq_maxlen;
4803 } else if (maxqlen < IF_RCVQ_MINLEN) {
4804 maxqlen = IF_RCVQ_MINLEN;
4805 }
4806
4807 inp = ifp->if_inp;
4808 lck_mtx_lock(&inp->dlth_lock);
4809 qlimit(&inp->dlth_pkts) = maxqlen;
4810 lck_mtx_unlock(&inp->dlth_lock);
4811
4812 return 0;
4813 }
4814
4815 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4816 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4817 {
4818 struct dlil_threading_info *inp;
4819
4820 if (ifp == NULL || maxqlen == NULL) {
4821 return EINVAL;
4822 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4823 return ENXIO;
4824 }
4825
4826 inp = ifp->if_inp;
4827 lck_mtx_lock(&inp->dlth_lock);
4828 *maxqlen = qlimit(&inp->dlth_pkts);
4829 lck_mtx_unlock(&inp->dlth_lock);
4830 return 0;
4831 }
4832
4833 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4834 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4835 uint16_t delay_timeout)
4836 {
4837 if (delay_qlen > 0 && delay_timeout > 0) {
4838 if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4839 ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4840 ifp->if_start_delay_timeout = min(20000, delay_timeout);
4841 /* convert timeout to nanoseconds */
4842 ifp->if_start_delay_timeout *= 1000;
4843 kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4844 ifp->if_xname, (uint32_t)delay_qlen,
4845 (uint32_t)delay_timeout);
4846 } else {
4847 if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4848 }
4849 }
4850
/*
 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
 * buf holds the full header.
 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* bounce buffer used when buf is not suitably aligned for header access */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			/* copy the header into the aligned bounce buffer */
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* fast path: no DSCP bits set, nothing to clear */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * Incrementally patch the IPv4 header checksum for the tos
		 * change (RFC 1624 style) instead of recomputing it.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		if (__improbable(p == lbuf)) {
			/* write the modified header back to the caller's buffer */
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			/* copy the header into the aligned bounce buffer */
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* fast path: no DSCP bits set in the flow word */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* IPv6 has no header checksum, so a simple rewrite suffices */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		if (__improbable(p == lbuf)) {
			/* write the modified header back to the caller's buffer */
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
4911
/*
 * Enqueue a single packet (mbuf or Skywalk packet) onto the supplied
 * classq (or the interface's default send classq), stamping the enqueue
 * timestamp, applying the Wi-Fi multicast DSCP-clearing workaround, and
 * driving the IFEF_ENQUEUE_MULTI start-delay heuristics before kicking
 * the driver's start routine.  The packet is consumed in all cases.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;	/* foreground send ts in nexus advisory */
	volatile uint64_t *rt_ts = NULL;	/* realtime send ts in nexus advisory */
	struct timespec now;
	u_int64_t now_nsec = 0;			/* 0 => uptime not sampled yet */
	int error = 0;
	uint8_t *mcast_buf = NULL;		/* non-NULL => IP hdr needs DSCP cleared */
	uint8_t ip_ver;				/* valid only when mcast_buf != NULL */
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here. Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					/* mbuf consumed by failed m_pullup */
					return ENOMEM;
				}
			}
			eh = mtod(p->cp_mbuf, struct ether_header *);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP/IPv6: leave the switch, skip DSCP work */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* m_pullup may have moved the data; refetch */
				eh = mtod(p->cp_mbuf, struct ether_header *);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface. If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* short frames cannot carry IP; skip the workaround */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		/* sample uptime if the timestamp path above did not */
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, Then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * is to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the current sampling window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle for >= 200ms: reset and disable delay */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				/* open a fresh sampling window */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first enqueue: initialize the sampling window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5221
5222 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5223 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5224 classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5225 boolean_t flush, boolean_t *pdrop)
5226 {
5227 int error;
5228
5229 /* enqueue the packet (caller consumes object) */
5230 error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5231 cnt, bytes, pdrop);
5232
5233 /*
5234 * Tell the driver to start dequeueing; do this even when the queue
5235 * for the packet is suspended (EQSUSPENDED), as the driver could still
5236 * be dequeueing from other unsuspended queues.
5237 */
5238 if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5239 ifnet_start(ifp);
5240 }
5241 return error;
5242 }
5243
5244 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5245 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5246 {
5247 struct ifnet *ifp = handle;
5248 boolean_t pdrop; /* dummy */
5249 uint32_t i;
5250
5251 ASSERT(n_pkts >= 1);
5252 for (i = 0; i < n_pkts - 1; i++) {
5253 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5254 FALSE, &pdrop);
5255 }
5256 /* flush with the last packet */
5257 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5258 TRUE, &pdrop);
5259
5260 return 0;
5261 }
5262
5263 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5264 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5265 classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5266 {
5267 if (ifp->if_output_netem != NULL) {
5268 bool drop;
5269 errno_t error;
5270 error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5271 *pdrop = drop ? TRUE : FALSE;
5272 return error;
5273 } else {
5274 return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5275 }
5276 }
5277
5278 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5279 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5280 {
5281 uint32_t bytes = m_pktlen(m);
5282 struct mbuf *tail = m;
5283 uint32_t cnt = 1;
5284 boolean_t pdrop;
5285
5286 while (tail->m_nextpkt) {
5287 VERIFY(tail->m_flags & M_PKTHDR);
5288 tail = tail->m_nextpkt;
5289 cnt++;
5290 bytes += m_pktlen(tail);
5291 }
5292
5293 return ifnet_enqueue_mbuf_chain(ifp, m, tail, cnt, bytes, TRUE, &pdrop);
5294 }
5295
5296 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5297 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5298 boolean_t *pdrop)
5299 {
5300 classq_pkt_t pkt;
5301
5302 if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5303 m->m_nextpkt != NULL) {
5304 if (m != NULL) {
5305 m_freem_list(m);
5306 *pdrop = TRUE;
5307 }
5308 return EINVAL;
5309 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5310 !IF_FULLY_ATTACHED(ifp)) {
5311 /* flag tested without lock for performance */
5312 m_freem(m);
5313 *pdrop = TRUE;
5314 return ENXIO;
5315 } else if (!(ifp->if_flags & IFF_UP)) {
5316 m_freem(m);
5317 *pdrop = TRUE;
5318 return ENETDOWN;
5319 }
5320
5321 CLASSQ_PKT_INIT_MBUF(&pkt, m);
5322 return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5323 }
5324
5325 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5326 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5327 struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5328 boolean_t *pdrop)
5329 {
5330 classq_pkt_t head, tail;
5331
5332 ASSERT(m_head != NULL);
5333 ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5334 ASSERT(m_tail != NULL);
5335 ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5336 ASSERT(ifp != NULL);
5337 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5338
5339 if (!IF_FULLY_ATTACHED(ifp)) {
5340 /* flag tested without lock for performance */
5341 m_freem_list(m_head);
5342 *pdrop = TRUE;
5343 return ENXIO;
5344 } else if (!(ifp->if_flags & IFF_UP)) {
5345 m_freem_list(m_head);
5346 *pdrop = TRUE;
5347 return ENETDOWN;
5348 }
5349
5350 CLASSQ_PKT_INIT_MBUF(&head, m_head);
5351 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5352 return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5353 flush, pdrop);
5354 }
5355
5356 #if SKYWALK
5357 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5358 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5359 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5360 {
5361 classq_pkt_t pkt;
5362
5363 ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5364
5365 if (__improbable(ifp == NULL || kpkt == NULL)) {
5366 if (kpkt != NULL) {
5367 pp_free_packet(__DECONST(struct kern_pbufpool *,
5368 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5369 *pdrop = TRUE;
5370 }
5371 return EINVAL;
5372 } else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5373 !IF_FULLY_ATTACHED(ifp))) {
5374 /* flag tested without lock for performance */
5375 pp_free_packet(__DECONST(struct kern_pbufpool *,
5376 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5377 *pdrop = TRUE;
5378 return ENXIO;
5379 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5380 pp_free_packet(__DECONST(struct kern_pbufpool *,
5381 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5382 *pdrop = TRUE;
5383 return ENETDOWN;
5384 }
5385
5386 CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5387 return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5388 }
5389
5390 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5391 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5392 boolean_t flush, boolean_t *pdrop)
5393 {
5394 return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5395 }
5396
5397 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5398 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5399 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5400 {
5401 return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5402 }
5403
5404 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5405 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5406 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5407 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5408 {
5409 classq_pkt_t head, tail;
5410
5411 ASSERT(k_head != NULL);
5412 ASSERT(k_tail != NULL);
5413 ASSERT(ifp != NULL);
5414 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5415
5416 if (!IF_FULLY_ATTACHED(ifp)) {
5417 /* flag tested without lock for performance */
5418 pp_free_packet_chain(k_head, NULL);
5419 *pdrop = TRUE;
5420 return ENXIO;
5421 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5422 pp_free_packet_chain(k_head, NULL);
5423 *pdrop = TRUE;
5424 return ENETDOWN;
5425 }
5426
5427 CLASSQ_PKT_INIT_PACKET(&head, k_head);
5428 CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5429 return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5430 flush, pdrop);
5431 }
5432
5433 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5434 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5435 struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5436 boolean_t *pdrop)
5437 {
5438 return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5439 cnt, bytes, flush, pdrop);
5440 }
5441
5442 errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5443 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5444 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5445 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5446 {
5447 return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5448 cnt, bytes, flush, pdrop);
5449 }
5450 #endif /* SKYWALK */
5451
5452 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5453 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5454 {
5455 errno_t rc;
5456 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5457
5458 if (ifp == NULL || mp == NULL) {
5459 return EINVAL;
5460 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5461 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5462 return ENXIO;
5463 }
5464 if (!ifnet_is_attached(ifp, 1)) {
5465 return ENXIO;
5466 }
5467
5468 #if SKYWALK
5469 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5470 #endif /* SKYWALK */
5471 rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5472 &pkt, NULL, NULL, NULL, 0);
5473 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5474 ifnet_decr_iorefcnt(ifp);
5475 *mp = pkt.cp_mbuf;
5476 return rc;
5477 }
5478
5479 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5480 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5481 struct mbuf **mp)
5482 {
5483 errno_t rc;
5484 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5485
5486 if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5487 return EINVAL;
5488 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5489 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5490 return ENXIO;
5491 }
5492 if (!ifnet_is_attached(ifp, 1)) {
5493 return ENXIO;
5494 }
5495
5496 #if SKYWALK
5497 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5498 #endif /* SKYWALK */
5499 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5500 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5501 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5502 ifnet_decr_iorefcnt(ifp);
5503 *mp = pkt.cp_mbuf;
5504 return rc;
5505 }
5506
5507 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5508 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5509 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5510 {
5511 errno_t rc;
5512 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5513 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5514
5515 if (ifp == NULL || head == NULL || pkt_limit < 1) {
5516 return EINVAL;
5517 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5518 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5519 return ENXIO;
5520 }
5521 if (!ifnet_is_attached(ifp, 1)) {
5522 return ENXIO;
5523 }
5524
5525 #if SKYWALK
5526 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5527 #endif /* SKYWALK */
5528 rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5529 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5530 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5531 ifnet_decr_iorefcnt(ifp);
5532 *head = pkt_head.cp_mbuf;
5533 if (tail != NULL) {
5534 *tail = pkt_tail.cp_mbuf;
5535 }
5536 return rc;
5537 }
5538
5539 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5540 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5541 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5542 {
5543 errno_t rc;
5544 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5545 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5546
5547 if (ifp == NULL || head == NULL || byte_limit < 1) {
5548 return EINVAL;
5549 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5550 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5551 return ENXIO;
5552 }
5553 if (!ifnet_is_attached(ifp, 1)) {
5554 return ENXIO;
5555 }
5556
5557 #if SKYWALK
5558 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5559 #endif /* SKYWALK */
5560 rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5561 byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5562 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5563 ifnet_decr_iorefcnt(ifp);
5564 *head = pkt_head.cp_mbuf;
5565 if (tail != NULL) {
5566 *tail = pkt_tail.cp_mbuf;
5567 }
5568 return rc;
5569 }
5570
5571 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5572 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5573 u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5574 u_int32_t *len)
5575 {
5576 errno_t rc;
5577 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5578 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5579
5580 if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5581 !MBUF_VALID_SC(sc)) {
5582 return EINVAL;
5583 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5584 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5585 return ENXIO;
5586 }
5587 if (!ifnet_is_attached(ifp, 1)) {
5588 return ENXIO;
5589 }
5590
5591 #if SKYWALK
5592 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5593 #endif /* SKYWALK */
5594 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
5595 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
5596 cnt, len, 0);
5597 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5598 ifnet_decr_iorefcnt(ifp);
5599 *head = pkt_head.cp_mbuf;
5600 if (tail != NULL) {
5601 *tail = pkt_tail.cp_mbuf;
5602 }
5603 return rc;
5604 }
5605
5606 #if XNU_TARGET_OS_OSX
5607 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)5608 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
5609 const struct sockaddr *dest, const char *dest_linkaddr,
5610 const char *frame_type, u_int32_t *pre, u_int32_t *post)
5611 {
5612 if (pre != NULL) {
5613 *pre = 0;
5614 }
5615 if (post != NULL) {
5616 *post = 0;
5617 }
5618
5619 return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
5620 }
5621 #endif /* XNU_TARGET_OS_OSX */
5622
5623 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5624 packet_has_vlan_tag(struct mbuf * m)
5625 {
5626 u_int tag = 0;
5627
5628 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5629 tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5630 if (tag == 0) {
5631 /* the packet is just priority-tagged, clear the bit */
5632 m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5633 }
5634 }
5635 return tag != 0;
5636 }
5637
/*
 * Run an inbound packet through the interface filter chain.  Returns 0
 * when the packet should continue up the stack; a non-zero return is the
 * first filter's error, and the packet is no longer the caller's to pass
 * along (*m_p may have been updated or taken over by the filter).
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family,
    boolean_t skip_bridge)
{
	boolean_t is_vlan_packet = FALSE;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	/* NB: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}
		/* the bridge has already seen the packet */
		if (skip_bridge &&
		    (filter->filt_flags & DLIL_IFF_BRIDGE) != 0) {
			continue;
		}
		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the lock across the callback; the busy
			 * marker set above keeps the list stable.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5703
/*
 * Run an outbound packet through the interface filter chain.  Returns 0
 * when the packet should proceed to transmission; a non-zero return is
 * the first filter's error, and the packet is no longer the caller's
 * responsibility (*m_p may have been updated or consumed by the filter).
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}
	/* NB: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/*
			 * Drop the lock across the callback; the busy
			 * marker set above keeps the list stable.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5756
5757 static void
dlil_ifproto_input(struct if_proto * ifproto,mbuf_t m)5758 dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
5759 {
5760 int error;
5761
5762 if (ifproto->proto_kpi == kProtoKPI_v1) {
5763 /* Version 1 protocols get one packet at a time */
5764 while (m != NULL) {
5765 char * frame_header;
5766 mbuf_t next_packet;
5767
5768 next_packet = m->m_nextpkt;
5769 m->m_nextpkt = NULL;
5770 frame_header = m->m_pkthdr.pkt_hdr;
5771 m->m_pkthdr.pkt_hdr = NULL;
5772 error = (*ifproto->kpi.v1.input)(ifproto->ifp,
5773 ifproto->protocol_family, m, frame_header);
5774 if (error != 0 && error != EJUSTRETURN) {
5775 m_freem(m);
5776 }
5777 m = next_packet;
5778 }
5779 } else if (ifproto->proto_kpi == kProtoKPI_v2) {
5780 /* Version 2 protocols support packet lists */
5781 error = (*ifproto->kpi.v2.input)(ifproto->ifp,
5782 ifproto->protocol_family, m);
5783 if (error != 0 && error != EJUSTRETURN) {
5784 m_freem_list(m);
5785 }
5786 }
5787 }
5788
5789 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5790 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5791 struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5792 {
5793 struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5794
5795 if (s->packets_in != 0) {
5796 d->packets_in += s->packets_in;
5797 }
5798 if (s->bytes_in != 0) {
5799 d->bytes_in += s->bytes_in;
5800 }
5801 if (s->errors_in != 0) {
5802 d->errors_in += s->errors_in;
5803 }
5804
5805 if (s->packets_out != 0) {
5806 d->packets_out += s->packets_out;
5807 }
5808 if (s->bytes_out != 0) {
5809 d->bytes_out += s->bytes_out;
5810 }
5811 if (s->errors_out != 0) {
5812 d->errors_out += s->errors_out;
5813 }
5814
5815 if (s->collisions != 0) {
5816 d->collisions += s->collisions;
5817 }
5818 if (s->dropped != 0) {
5819 d->dropped += s->dropped;
5820 }
5821
5822 if (poll) {
5823 PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5824 }
5825 }
5826
/*
 * Flush the input thread's locally-accumulated statistics into the
 * interface's global counters (ifp->if_data), zeroing each local counter
 * after it has been published.  Returns TRUE when the interface has a
 * non-zero data threshold configured, so the caller knows whether
 * threshold notifications may need to be evaluated.
 */
static boolean_t
dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
{
	struct ifnet_stat_increment_param *s = &inp->dlth_stats;

	/*
	 * Use of atomic operations is unavoidable here because
	 * these stats may also be incremented elsewhere via KPIs.
	 */
	if (s->packets_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
		s->packets_in = 0;
	}
	if (s->bytes_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
		s->bytes_in = 0;
	}
	if (s->errors_in != 0) {
		os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
		s->errors_in = 0;
	}

	if (s->packets_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
		s->packets_out = 0;
	}
	if (s->bytes_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
		s->bytes_out = 0;
	}
	if (s->errors_out != 0) {
		os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
		s->errors_out = 0;
	}

	if (s->collisions != 0) {
		os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
		s->collisions = 0;
	}
	if (s->dropped != 0) {
		/* note: locally-accumulated drops publish as input-queue drops */
		os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
		s->dropped = 0;
	}

	/*
	 * No need for atomic operations as they are modified here
	 * only from within the DLIL input thread context.
	 */
	if (ifp->if_poll_tstats.packets != 0) {
		ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
		ifp->if_poll_tstats.packets = 0;
	}
	if (ifp->if_poll_tstats.bytes != 0) {
		ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
		ifp->if_poll_tstats.bytes = 0;
	}

	return ifp->if_data_threshold != 0;
}
5886
5887 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)5888 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
5889 {
5890 return dlil_input_packet_list_common(ifp, m, 0,
5891 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
5892 }
5893
5894 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)5895 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
5896 u_int32_t cnt, ifnet_model_t mode)
5897 {
5898 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
5899 }
5900
/*
 * Run the inbound chain through bridge early input, returning the
 * (possibly consumed/modified) chain.  The filter list is marked busy
 * for the duration so that interface filters cannot be detached while
 * bridge_early_input() runs; note the busy/unbusy calls each take the
 * filter lock only briefly (spin) and drop it around the bridge call.
 */
static inline mbuf_t
handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt)
{
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	if_flt_monitor_busy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* re-check under busy: the bridge may have detached meanwhile */
	if (ifp->if_bridge != NULL) {
		m = bridge_early_input(ifp, m, cnt);
	}
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);
	return m;
}
5916
/*
 * Core DLIL inbound path.  Walks the packet chain `m`, and for each packet:
 * takes an IO (data-mov) reference on the receiving interface, sanitizes
 * packet flags, demuxes to a protocol family, performs CLAT46 translation
 * where applicable, runs the interface filters, and finally batches
 * consecutive packets destined to the same protocol attachment into a
 * sub-chain handed to dlil_ifproto_input().
 *
 * ifp_param  - receiving interface, or NULL to use each mbuf's rcvif
 * m          - chain of received packets (linked via m_nextpkt)
 * cnt        - packet count (extended callers only)
 * mode       - interface input model (polling on/off)
 * ext        - TRUE when invoked via the extended KPI
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;	/* proto of the batch being built */
	mbuf_t pkt_first = NULL;		/* head of the current batch */
	mbuf_t *pkt_next = NULL;		/* tail pointer of the current batch */
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;			/* 1 while we hold a datamov ref on ifp */
	boolean_t skip_bridge_filter = FALSE;

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* arm opportunistic polling only for extended callers in poll mode */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}
	if (bridge_enable_early_input != 0 &&
	    ifp != NULL && ifp->if_bridge != NULL) {
		/* bridge already saw the chain; don't run its filter again below */
		m = handle_bridge_early_input(ifp, m, cnt);
		skip_bridge_filter = TRUE;
	}
	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;	/* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		/* kick the poller every poll_ival packets on legacy RXPOLL ifs */
		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach the packet from the chain before processing */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				/* demux consumed the packet */
				goto next;
			}
			protocol_family = 0;
		}
		/* check for an updated frame header */
		if (m->m_pkthdr.pkt_hdr != NULL) {
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			/* log (and hexdump) wake packets for debugging */
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if ((m->m_flags & M_PROMISC) == 0 &&
		    protocol_family == PF_INET6 &&
		    IS_INTF_CLAT46(ifp) &&
		    dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			error = dlil_clat64(ifp, &protocol_family, &m);
			/* NOTE(review): m may have been replaced by dlil_clat64 */
			data = mtod(m, char*);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_freem(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceeding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has been
		 * been queried by the driver before the device went to sleep
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			/* invalidate the offloaded checksum if the header is bogus */
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - (uintptr_t)frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			os_atomic_inc(&ifp->if_imcasts, relaxed);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family, skip_bridge_filter);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN and Bond interface receives packets by attaching
		 * a "protocol" to the underlying interface.
		 * A promiscuous packet needs to be delivered to the
		 * VLAN or Bond interface since:
		 * - Bond interface member may not support setting the
		 *   MAC address, so packets are inherently "promiscuous"
		 * - A VLAN or Bond interface could be members of a bridge,
		 *   where promiscuous packets correspond to other
		 *   devices that the bridge forwards packets to/from
		 */
		if ((m->m_flags & M_PROMISC) != 0) {
			switch (protocol_family) {
			case PF_VLAN:
			case PF_BOND:
				/* VLAN and Bond get promiscuous packets */
				break;
			default:
				m_freem(m);
				goto next;
			}
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same proto as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6267
6268 static errno_t
if_mcasts_update_common(struct ifnet * ifp,bool sync)6269 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6270 {
6271 errno_t err;
6272
6273 if (sync) {
6274 err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6275 if (err == EAFNOSUPPORT) {
6276 err = 0;
6277 }
6278 } else {
6279 ifnet_ioctl_async(ifp, SIOCADDMULTI);
6280 err = 0;
6281 }
6282 DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6283 "(err=%d)\n", if_name(ifp),
6284 (err == 0 ? "successfully restored" : "failed to restore"),
6285 ifp->if_updatemcasts, err);
6286
6287 /* just return success */
6288 return 0;
6289 }
6290
/* Asynchronous variant: queues the multicast refresh, never blocks. */
static errno_t
if_mcasts_update_async(struct ifnet *ifp)
{
	return if_mcasts_update_common(ifp, false);
}
6296
/* Synchronous variant: issues the multicast refresh ioctl inline. */
errno_t
if_mcasts_update(struct ifnet *ifp)
{
	return if_mcasts_update_common(ifp, true);
}
6302
/* If ifp is set, we will increment the generation for the interface */
int
dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
{
	if (ifp != NULL) {
		ifnet_increment_generation(ifp);
	}

#if NECP
	/* interface state changed; let NECP clients re-evaluate */
	necp_update_all_clients();
#endif /* NECP */

	/* deliver the kernel event; returns kev_post_msg's status */
	return kev_post_msg(event);
}
6317
6318 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6319 dlil_post_sifflags_msg(struct ifnet * ifp)
6320 {
6321 struct kev_msg ev_msg;
6322 struct net_event_data ev_data;
6323
6324 bzero(&ev_data, sizeof(ev_data));
6325 bzero(&ev_msg, sizeof(ev_msg));
6326 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6327 ev_msg.kev_class = KEV_NETWORK_CLASS;
6328 ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6329 ev_msg.event_code = KEV_DL_SIFFLAGS;
6330 strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6331 ev_data.if_family = ifp->if_family;
6332 ev_data.if_unit = (u_int32_t) ifp->if_unit;
6333 ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6334 ev_msg.dv[0].data_ptr = &ev_data;
6335 ev_msg.dv[1].data_length = 0;
6336 dlil_post_complete_msg(ifp, &ev_msg);
6337 }
6338
#define TMP_IF_PROTO_ARR_SIZE 10
/*
 * Distribute a kernel event to everything attached to the interface:
 * first the interface filters, then every attached protocol's event
 * handler, then the interface's own if_event callback, and finally post
 * the event system-wide via dlil_post_complete_msg().  The protocol list
 * is snapshotted into a ref-held array (stack-based for up to
 * TMP_IF_PROTO_ARR_SIZE protocols) so handlers can be invoked without
 * holding the ifnet lock.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* drop the lock while calling out to the filter */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			/* small list fits in the on-stack array */
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* snapshot the hash buckets, taking a ref on each proto */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* invoke each protocol's event handler without any lock held */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6439
6440 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6441 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6442 {
6443 struct kev_msg kev_msg;
6444 int result = 0;
6445
6446 if (ifp == NULL || event == NULL) {
6447 return EINVAL;
6448 }
6449
6450 bzero(&kev_msg, sizeof(kev_msg));
6451 kev_msg.vendor_code = event->vendor_code;
6452 kev_msg.kev_class = event->kev_class;
6453 kev_msg.kev_subclass = event->kev_subclass;
6454 kev_msg.event_code = event->event_code;
6455 kev_msg.dv[0].data_ptr = &event->event_data[0];
6456 kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6457 kev_msg.dv[1].data_length = 0;
6458
6459 result = dlil_event_internal(ifp, &kev_msg, TRUE);
6460
6461 return result;
6462 }
6463
6464 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6465 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6466 {
6467 mbuf_t n = m;
6468 int chainlen = 0;
6469
6470 while (n != NULL) {
6471 chainlen++;
6472 n = n->m_next;
6473 }
6474 switch (chainlen) {
6475 case 0:
6476 break;
6477 case 1:
6478 os_atomic_inc(&cls->cls_one, relaxed);
6479 break;
6480 case 2:
6481 os_atomic_inc(&cls->cls_two, relaxed);
6482 break;
6483 case 3:
6484 os_atomic_inc(&cls->cls_three, relaxed);
6485 break;
6486 case 4:
6487 os_atomic_inc(&cls->cls_four, relaxed);
6488 break;
6489 case 5:
6490 default:
6491 os_atomic_inc(&cls->cls_five_or_more, relaxed);
6492 break;
6493 }
6494 }
6495
#if CONFIG_DTRACE
/*
 * Fire the DTrace ip:::send probe for an outbound IPv4 or IPv6 packet.
 * Kept out-of-line (noinline) so the probe sites stay cheap when DTrace
 * is not tracing.  Packets of other protocol families are ignored.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
#endif /* CONFIG_DTRACE */
6514
6515 /*
6516 * dlil_output
6517 *
6518 * Caller should have a lock on the protocol domain if the protocol
6519 * doesn't support finer grained locking. In most cases, the lock
6520 * will be held from the socket layer and won't be released until
6521 * we return back to the socket layer.
6522 *
6523 * This does mean that we must take a protocol lock before we take
6524 * an interface lock if we're going to take both. This makes sense
6525 * because a protocol is likely to interact with an ifp while it
6526 * is under the protocol lock.
6527 *
6528 * An advisory code will be returned if adv is not null. This
6529 * can be used to provide feedback about interface queues to the
6530 * application.
6531 */
6532 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int flags,struct flowadv * adv)6533 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6534 void *route, const struct sockaddr *dest, int flags, struct flowadv *adv)
6535 {
6536 char *frame_type = NULL;
6537 char *dst_linkaddr = NULL;
6538 int retval = 0;
6539 char frame_type_buffer[DLIL_MAX_FRAME_TYPE_BUFFER_SIZE];
6540 char dst_linkaddr_buffer[DLIL_MAX_LINKADDR_BUFFER_SIZE];
6541 struct if_proto *proto = NULL;
6542 mbuf_t m = NULL;
6543 mbuf_t send_head = NULL;
6544 mbuf_t *send_tail = &send_head;
6545 int iorefcnt = 0;
6546 u_int32_t pre = 0, post = 0;
6547 u_int32_t fpkts = 0, fbytes = 0;
6548 int32_t flen = 0;
6549 struct timespec now;
6550 u_int64_t now_nsec;
6551 boolean_t did_clat46 = FALSE;
6552 protocol_family_t old_proto_family = proto_family;
6553 struct sockaddr_in6 dest6;
6554 struct rtentry *rt = NULL;
6555 u_int16_t m_loop_set = 0;
6556 bool raw = (flags & DLIL_OUTPUT_FLAGS_RAW) != 0;
6557
6558 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6559
6560 /*
6561 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6562 * from happening while this operation is in progress
6563 */
6564 if (!ifnet_datamov_begin(ifp)) {
6565 retval = ENXIO;
6566 goto cleanup;
6567 }
6568 iorefcnt = 1;
6569
6570 VERIFY(ifp->if_output_dlil != NULL);
6571
6572 /* update the driver's multicast filter, if needed */
6573 if (ifp->if_updatemcasts > 0) {
6574 if_mcasts_update_async(ifp);
6575 ifp->if_updatemcasts = 0;
6576 }
6577
6578 frame_type = frame_type_buffer;
6579 dst_linkaddr = dst_linkaddr_buffer;
6580
6581 if (flags == DLIL_OUTPUT_FLAGS_NONE) {
6582 ifnet_lock_shared(ifp);
6583 /* callee holds a proto refcnt upon success */
6584 proto = find_attached_proto(ifp, proto_family);
6585 if (proto == NULL) {
6586 ifnet_lock_done(ifp);
6587 retval = ENXIO;
6588 goto cleanup;
6589 }
6590 ifnet_lock_done(ifp);
6591 }
6592
6593 preout_again:
6594 if (packetlist == NULL) {
6595 goto cleanup;
6596 }
6597
6598 m = packetlist;
6599 packetlist = packetlist->m_nextpkt;
6600 m->m_nextpkt = NULL;
6601
6602 m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6603
6604 /*
6605 * Perform address family translation for the first
6606 * packet outside the loop in order to perform address
6607 * lookup for the translated proto family.
6608 */
6609 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6610 (ifp->if_type == IFT_CELLULAR ||
6611 dlil_is_clat_needed(proto_family, m))) {
6612 retval = dlil_clat46(ifp, &proto_family, &m);
6613 /*
6614 * Go to the next packet if translation fails
6615 */
6616 if (retval != 0) {
6617 m_freem(m);
6618 m = NULL;
6619 ip6stat.ip6s_clat464_out_drop++;
6620 /* Make sure that the proto family is PF_INET */
6621 ASSERT(proto_family == PF_INET);
6622 goto preout_again;
6623 }
6624 /*
6625 * Free the old one and make it point to the IPv6 proto structure.
6626 *
6627 * Change proto for the first time we have successfully
6628 * performed address family translation.
6629 */
6630 if (!did_clat46 && proto_family == PF_INET6) {
6631 did_clat46 = TRUE;
6632
6633 if (proto != NULL) {
6634 if_proto_free(proto);
6635 }
6636 ifnet_lock_shared(ifp);
6637 /* callee holds a proto refcnt upon success */
6638 proto = find_attached_proto(ifp, proto_family);
6639 if (proto == NULL) {
6640 ifnet_lock_done(ifp);
6641 retval = ENXIO;
6642 m_freem(m);
6643 m = NULL;
6644 goto cleanup;
6645 }
6646 ifnet_lock_done(ifp);
6647 if (ifp->if_type == IFT_ETHER) {
6648 /* Update the dest to translated v6 address */
6649 dest6.sin6_len = sizeof(struct sockaddr_in6);
6650 dest6.sin6_family = AF_INET6;
6651 dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
6652 dest = SA(&dest6);
6653
6654 /*
6655 * Lookup route to the translated destination
6656 * Free this route ref during cleanup
6657 */
6658 rt = rtalloc1_scoped(SA(&dest6),
6659 0, 0, ifp->if_index);
6660
6661 route = rt;
6662 }
6663 }
6664 }
6665
6666 /*
6667 * This path gets packet chain going to the same destination.
6668 * The pre output routine is used to either trigger resolution of
6669 * the next hop or retrieve the next hop's link layer addressing.
6670 * For ex: ether_inet(6)_pre_output routine.
6671 *
6672 * If the routine returns EJUSTRETURN, it implies that packet has
6673 * been queued, and therefore we have to call preout_again for the
6674 * following packet in the chain.
6675 *
6676 * For errors other than EJUSTRETURN, the current packet is freed
6677 * and the rest of the chain (pointed by packetlist is freed as
6678 * part of clean up.
6679 *
6680 * Else if there is no error the retrieved information is used for
6681 * all the packets in the chain.
6682 */
6683 if (flags == DLIL_OUTPUT_FLAGS_NONE) {
6684 proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
6685 proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
6686 retval = 0;
6687 if (preoutp != NULL) {
6688 retval = preoutp(ifp, proto_family, &m, dest, route,
6689 frame_type, dst_linkaddr);
6690
6691 if (retval != 0) {
6692 if (retval == EJUSTRETURN) {
6693 goto preout_again;
6694 }
6695 m_freem(m);
6696 m = NULL;
6697 goto cleanup;
6698 }
6699 }
6700 }
6701
6702 nanouptime(&now);
6703 net_timernsec(&now, &now_nsec);
6704
6705 do {
6706 /*
6707 * pkt_hdr is set here to point to m_data prior to
6708 * calling into the framer. This value of pkt_hdr is
6709 * used by the netif gso logic to retrieve the ip header
6710 * for the TCP packets, offloaded for TSO processing.
6711 */
6712 if (raw && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
6713 uint8_t vlan_encap_len = 0;
6714
6715 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
6716 vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
6717 }
6718 m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
6719 } else {
6720 m->m_pkthdr.pkt_hdr = mtod(m, void *);
6721 }
6722
6723 /*
6724 * Perform address family translation if needed.
6725 * For now we only support stateless 4 to 6 translation
6726 * on the out path.
6727 *
6728 * The routine below translates IP header, updates protocol
6729 * checksum and also translates ICMP.
6730 *
6731 * We skip the first packet as it is already translated and
6732 * the proto family is set to PF_INET6.
6733 */
6734 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6735 (ifp->if_type == IFT_CELLULAR ||
6736 dlil_is_clat_needed(proto_family, m))) {
6737 retval = dlil_clat46(ifp, &proto_family, &m);
6738 /* Goto the next packet if the translation fails */
6739 if (retval != 0) {
6740 m_freem(m);
6741 m = NULL;
6742 ip6stat.ip6s_clat464_out_drop++;
6743 goto next;
6744 }
6745 }
6746
6747 #if CONFIG_DTRACE
6748 if (flags == DLIL_OUTPUT_FLAGS_NONE) {
6749 dlil_output_dtrace(ifp, proto_family, m);
6750 }
6751 #endif /* CONFIG_DTRACE */
6752
6753 if (flags == DLIL_OUTPUT_FLAGS_NONE && ifp->if_framer != NULL) {
6754 int rcvif_set = 0;
6755
6756 /*
6757 * If this is a broadcast packet that needs to be
6758 * looped back into the system, set the inbound ifp
6759 * to that of the outbound ifp. This will allow
6760 * us to determine that it is a legitimate packet
6761 * for the system. Only set the ifp if it's not
6762 * already set, just to be safe.
6763 */
6764 if ((m->m_flags & (M_BCAST | M_LOOP)) &&
6765 m->m_pkthdr.rcvif == NULL) {
6766 m->m_pkthdr.rcvif = ifp;
6767 rcvif_set = 1;
6768 }
6769 m_loop_set = m->m_flags & M_LOOP;
6770 retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
6771 frame_type, &pre, &post);
6772 if (retval != 0) {
6773 if (retval != EJUSTRETURN) {
6774 m_freem(m);
6775 }
6776 goto next;
6777 }
6778
6779 /*
6780 * For partial checksum offload, adjust the start
6781 * and stuff offsets based on the prepended header.
6782 */
6783 if ((m->m_pkthdr.csum_flags &
6784 (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6785 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6786 m->m_pkthdr.csum_tx_stuff += pre;
6787 m->m_pkthdr.csum_tx_start += pre;
6788 }
6789
6790 if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
6791 dlil_output_cksum_dbg(ifp, m, pre,
6792 proto_family);
6793 }
6794
6795 /*
6796 * Clear the ifp if it was set above, and to be
6797 * safe, only if it is still the same as the
6798 * outbound ifp we have in context. If it was
6799 * looped back, then a copy of it was sent to the
6800 * loopback interface with the rcvif set, and we
6801 * are clearing the one that will go down to the
6802 * layer below.
6803 */
6804 if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
6805 m->m_pkthdr.rcvif = NULL;
6806 }
6807 }
6808
6809 /*
6810 * Let interface filters (if any) do their thing ...
6811 */
6812 if ((flags & DLIL_OUTPUT_FLAGS_SKIP_IF_FILTERS) == 0) {
6813 retval = dlil_interface_filters_output(ifp, &m, proto_family);
6814 if (retval != 0) {
6815 if (retval != EJUSTRETURN) {
6816 m_freem(m);
6817 }
6818 goto next;
6819 }
6820 }
6821 /*
6822 * Strip away M_PROTO1 bit prior to sending packet
6823 * to the driver as this field may be used by the driver
6824 */
6825 m->m_flags &= ~M_PROTO1;
6826
6827 /*
6828 * If the underlying interface is not capable of handling a
6829 * packet whose data portion spans across physically disjoint
6830 * pages, we need to "normalize" the packet so that we pass
6831 * down a chain of mbufs where each mbuf points to a span that
6832 * resides in the system page boundary. If the packet does
6833 * not cross page(s), the following is a no-op.
6834 */
6835 if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
6836 if ((m = m_normalize(m)) == NULL) {
6837 goto next;
6838 }
6839 }
6840
6841 /*
6842 * If this is a TSO packet, make sure the interface still
6843 * advertise TSO capability.
6844 */
6845 if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
6846 retval = EMSGSIZE;
6847 m_freem(m);
6848 goto cleanup;
6849 }
6850
6851 ifp_inc_traffic_class_out(ifp, m);
6852
6853 #if SKYWALK
6854 /*
6855 * For native skywalk devices, packets will be passed to pktap
6856 * after GSO or after the mbuf to packet conversion.
6857 * This is done for IPv4/IPv6 packets only because there is no
6858 * space in the mbuf to pass down the proto family.
6859 */
6860 if (dlil_is_native_netif_nexus(ifp)) {
6861 if (raw || m->m_pkthdr.pkt_proto == 0) {
6862 pktap_output(ifp, proto_family, m, pre, post);
6863 m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
6864 }
6865 } else {
6866 pktap_output(ifp, proto_family, m, pre, post);
6867 }
6868 #else /* SKYWALK */
6869 pktap_output(ifp, proto_family, m, pre, post);
6870 #endif /* SKYWALK */
6871
6872 /*
6873 * Count the number of elements in the mbuf chain
6874 */
6875 if (tx_chain_len_count) {
6876 dlil_count_chain_len(m, &tx_chain_len_stats);
6877 }
6878
6879 /*
6880 * Discard partial sum information if this packet originated
6881 * from another interface; the packet would already have the
6882 * final checksum and we shouldn't recompute it.
6883 */
6884 if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
6885 (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6886 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6887 m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
6888 m->m_pkthdr.csum_data = 0;
6889 }
6890
6891 /*
6892 * Finally, call the driver.
6893 */
6894 if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
6895 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6896 flen += (m_pktlen(m) - (pre + post));
6897 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6898 }
6899 (void) mbuf_set_timestamp(m, now_nsec, TRUE);
6900
6901 *send_tail = m;
6902 send_tail = &m->m_nextpkt;
6903 } else {
6904 /*
6905 * Record timestamp; ifnet_enqueue() will use this info
6906 * rather than redoing the work.
6907 */
6908 nanouptime(&now);
6909 net_timernsec(&now, &now_nsec);
6910 (void) mbuf_set_timestamp(m, now_nsec, TRUE);
6911
6912 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6913 flen = (m_pktlen(m) - (pre + post));
6914 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6915 } else {
6916 flen = 0;
6917 }
6918 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
6919 0, 0, 0, 0, 0);
6920 retval = (*ifp->if_output_dlil)(ifp, m);
6921 if (retval == EQFULL || retval == EQSUSPENDED) {
6922 if (adv != NULL && adv->code == FADV_SUCCESS) {
6923 adv->code = (retval == EQFULL ?
6924 FADV_FLOW_CONTROLLED :
6925 FADV_SUSPENDED);
6926 }
6927 retval = 0;
6928 }
6929 if (retval == 0 && flen > 0) {
6930 fbytes += flen;
6931 fpkts++;
6932 }
6933 if (retval != 0 && dlil_verbose) {
6934 DLIL_PRINTF("%s: output error on %s retval = %d\n",
6935 __func__, if_name(ifp),
6936 retval);
6937 }
6938 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
6939 0, 0, 0, 0, 0);
6940 }
6941 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6942
6943 next:
6944 m = packetlist;
6945 if (m != NULL) {
6946 m->m_flags |= m_loop_set;
6947 packetlist = packetlist->m_nextpkt;
6948 m->m_nextpkt = NULL;
6949 }
6950 /* Reset the proto family to old proto family for CLAT */
6951 if (did_clat46) {
6952 proto_family = old_proto_family;
6953 }
6954 } while (m != NULL);
6955
6956 if (send_head != NULL) {
6957 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
6958 0, 0, 0, 0, 0);
6959 if (ifp->if_eflags & IFEF_SENDLIST) {
6960 retval = (*ifp->if_output_dlil)(ifp, send_head);
6961 if (retval == EQFULL || retval == EQSUSPENDED) {
6962 if (adv != NULL) {
6963 adv->code = (retval == EQFULL ?
6964 FADV_FLOW_CONTROLLED :
6965 FADV_SUSPENDED);
6966 }
6967 retval = 0;
6968 }
6969 if (retval == 0 && flen > 0) {
6970 fbytes += flen;
6971 fpkts++;
6972 }
6973 if (retval != 0 && dlil_verbose) {
6974 DLIL_PRINTF("%s: output error on %s retval = %d\n",
6975 __func__, if_name(ifp), retval);
6976 }
6977 } else {
6978 struct mbuf *send_m;
6979 int enq_cnt = 0;
6980 VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
6981 while (send_head != NULL) {
6982 send_m = send_head;
6983 send_head = send_m->m_nextpkt;
6984 send_m->m_nextpkt = NULL;
6985 retval = (*ifp->if_output_dlil)(ifp, send_m);
6986 if (retval == EQFULL || retval == EQSUSPENDED) {
6987 if (adv != NULL) {
6988 adv->code = (retval == EQFULL ?
6989 FADV_FLOW_CONTROLLED :
6990 FADV_SUSPENDED);
6991 }
6992 retval = 0;
6993 }
6994 if (retval == 0) {
6995 enq_cnt++;
6996 if (flen > 0) {
6997 fpkts++;
6998 }
6999 }
7000 if (retval != 0 && dlil_verbose) {
7001 DLIL_PRINTF("%s: output error on %s "
7002 "retval = %d\n",
7003 __func__, if_name(ifp), retval);
7004 }
7005 }
7006 if (enq_cnt > 0) {
7007 fbytes += flen;
7008 ifnet_start(ifp);
7009 }
7010 }
7011 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7012 }
7013
7014 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7015
7016 cleanup:
7017 if (fbytes > 0) {
7018 ifp->if_fbytes += fbytes;
7019 }
7020 if (fpkts > 0) {
7021 ifp->if_fpackets += fpkts;
7022 }
7023 if (proto != NULL) {
7024 if_proto_free(proto);
7025 }
7026 if (packetlist) { /* if any packets are left, clean up */
7027 mbuf_freem_list(packetlist);
7028 }
7029 if (retval == EJUSTRETURN) {
7030 retval = 0;
7031 }
7032 if (iorefcnt == 1) {
7033 ifnet_datamov_end(ifp);
7034 }
7035 if (rt != NULL) {
7036 rtfree(rt);
7037 rt = NULL;
7038 }
7039
7040 return retval;
7041 }
7042
7043 /*
7044 * This routine checks if the destination address is not a loopback, link-local,
7045 * multicast or broadcast address.
7046 */
7047 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7048 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7049 {
7050 int ret = 0;
7051 switch (proto_family) {
7052 case PF_INET: {
7053 struct ip *iph = mtod(m, struct ip *);
7054 if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7055 ret = 1;
7056 }
7057 break;
7058 }
7059 case PF_INET6: {
7060 struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7061 if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7062 CLAT64_NEEDED(&ip6h->ip6_dst)) {
7063 ret = 1;
7064 }
7065 break;
7066 }
7067 }
7068
7069 return ret;
7070 }
7071 /*
7072 * @brief This routine translates IPv4 packet to IPv6 packet,
7073 * updates protocol checksum and also translates ICMP for code
7074 * along with inner header translation.
7075 *
7076 * @param ifp Pointer to the interface
7077 * @param proto_family pointer to protocol family. It is updated if function
7078 * performs the translation successfully.
7079 * @param m Pointer to the pointer pointing to the packet. Needed because this
7080 * routine can end up changing the mbuf to a different one.
7081 *
7082 * @return 0 on success or else a negative value.
7083 */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_addr src_storage = {};
	struct in6_addr *src = NULL;
	struct sockaddr_in6 dstsock = {};
	int error = 0;
	uint16_t off = 0;
	uint16_t tot_len = 0;
	uint16_t ip_id_val = 0;
	uint16_t ip_frag_off = 0;

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/* Wrap the mbuf in a pbuf so the nat464 helpers can rewrite it */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/* Stash IPv4 header fields needed after the header is overwritten */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);	/* IPv4 header length in bytes */
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dstsock.sin6_addr)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	dstsock.sin6_len = sizeof(struct sockaddr_in6);
	dstsock.sin6_family = AF_INET6;

	/*
	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
	 * translation.
	 */
	src = in6_selectsrc_core(&dstsock, 0, ifp, 0, &src_storage, NULL, &error,
	    NULL, NULL, TRUE);

	if (src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}


	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, src_storage, dstsock.sin6_addr, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL; /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	/*
	 * Hand the (possibly reallocated) mbuf back to the caller; on an
	 * invalid pbuf the packet was lost during translation, so report
	 * failure and clear *m so the caller does not touch freed memory.
	 */
	if (pbuf_is_valid(pbuf)) {
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		error = -1;
		*m = NULL;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	/* Only claim PF_INET6 once every translation step succeeded */
	if (error == 0) {
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7217
7218 /*
7219 * @brief This routine translates incoming IPv6 to IPv4 packet,
7220 * updates protocol checksum and also translates ICMPv6 outer
7221 * and inner headers
7222 *
7223 * @return 0 on success or else a negative value.
7224 */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_dst = NULL;
	struct in_ifaddr *ia4_clat_dst = NULL;
	struct in_addr *dst = NULL;
	struct in_addr src;
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	/* Save the original addresses; translation rewrites the header */
	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		/* Wrap the mbuf in a pbuf so the nat464 helpers can rewrite it */
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrieve the local CLAT46 IPv4 address reserved for stateless
		 * translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			ifa_remref(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		ifa_remref(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		/* Traffic class bits of the flow label become the IPv4 TOS */
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		if (ia4_clat_dst != NULL) {
			ifa_remref(&ia4_clat_dst->ia_ifa);
		}

		/* Hand the (possibly reallocated) mbuf back to the caller */
		if (pbuf_is_valid(pbuf)) {
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		/* Only claim PF_INET once every translation step succeeded */
		if (error == 0) {
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7359
/* The following is used to enqueue work items for ifnet ioctl events */
static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);

/* Arguments captured for a deferred interface ioctl */
struct ifnet_ioctl_event {
	struct ifnet *ifp;	/* interface; carries an io ref until the callback runs */
	u_long ioctl_code;	/* SIOCADDMULTI or SIOCDELMULTI (see ifnet_ioctl_async) */
};

/* Work-queue entry embedding the deferred-ioctl arguments */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;	/* generic queue linkage; recovered via __container_of */
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7372
7373 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7374 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7375 {
7376 struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7377 bool compare_expected;
7378
7379 /*
7380 * Get an io ref count if the interface is attached.
7381 * At this point it most likely is. We are taking a reference for
7382 * deferred processing.
7383 */
7384 if (!ifnet_is_attached(ifp, 1)) {
7385 os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7386 "is not attached",
7387 __func__, __LINE__, if_name(ifp), ioctl_code);
7388 return;
7389 }
7390 switch (ioctl_code) {
7391 case SIOCADDMULTI:
7392 compare_expected = false;
7393 if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
7394 ifnet_decr_iorefcnt(ifp);
7395 return;
7396 }
7397 break;
7398 case SIOCDELMULTI:
7399 compare_expected = false;
7400 if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
7401 ifnet_decr_iorefcnt(ifp);
7402 return;
7403 }
7404 break;
7405 default:
7406 os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
7407 __func__, __LINE__, if_name(ifp), ioctl_code);
7408 return;
7409 }
7410
7411 p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7412 Z_WAITOK | Z_ZERO | Z_NOFAIL);
7413
7414 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7415 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7416 p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7417 nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7418 }
7419
7420 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7421 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7422 {
7423 struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7424 struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7425
7426 struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7427 u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7428 int ret = 0;
7429
7430 switch (ioctl_code) {
7431 case SIOCADDMULTI:
7432 atomic_store(&ifp->if_mcast_add_signaled, false);
7433 break;
7434 case SIOCDELMULTI:
7435 atomic_store(&ifp->if_mcast_del_signaled, false);
7436 break;
7437 }
7438 if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7439 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7440 __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7441 } else if (dlil_verbose) {
7442 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7443 "for ioctl %lu",
7444 __func__, __LINE__, if_name(ifp), ioctl_code);
7445 }
7446 ifnet_decr_iorefcnt(ifp);
7447 kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7448 return;
7449 }
7450
/*
 * Dispatch an ioctl to, in order: the interface filters, the attached
 * protocol (if proto_fam != 0), and finally the interface itself.
 * EOPNOTSUPP from one layer lets the next layer try; any other error
 * (or EJUSTRETURN, meaning "handled, stop") short-circuits.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;	/* sentinel: "nobody has handled it yet" */
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/* drop the lock: the filter callback may block */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled, stop": report success to the caller */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7568
7569 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7570 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7571 {
7572 errno_t error = 0;
7573
7574 if (ifp->if_set_bpf_tap) {
7575 /* Get an io reference on the interface if it is attached */
7576 if (!ifnet_is_attached(ifp, 1)) {
7577 return ENXIO;
7578 }
7579 error = ifp->if_set_bpf_tap(ifp, mode, callback);
7580 ifnet_decr_iorefcnt(ifp);
7581 }
7582 return error;
7583 }
7584
7585 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7586 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7587 struct sockaddr *ll_addr, size_t ll_len)
7588 {
7589 errno_t result = EOPNOTSUPP;
7590 struct if_proto *proto;
7591 const struct sockaddr *verify;
7592 proto_media_resolve_multi resolvep;
7593
7594 if (!ifnet_is_attached(ifp, 1)) {
7595 return result;
7596 }
7597
7598 bzero(ll_addr, ll_len);
7599
7600 /* Call the protocol first; callee holds a proto refcnt upon success */
7601 ifnet_lock_shared(ifp);
7602 proto = find_attached_proto(ifp, proto_addr->sa_family);
7603 ifnet_lock_done(ifp);
7604 if (proto != NULL) {
7605 resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7606 proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7607 if (resolvep != NULL) {
7608 result = resolvep(ifp, proto_addr, SDL(ll_addr), ll_len);
7609 }
7610 if_proto_free(proto);
7611 }
7612
7613 /* Let the interface verify the multicast address */
7614 if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7615 if (result == 0) {
7616 verify = ll_addr;
7617 } else {
7618 verify = proto_addr;
7619 }
7620 result = ifp->if_check_multi(ifp, verify);
7621 }
7622
7623 ifnet_decr_iorefcnt(ifp);
7624 return result;
7625 }
7626
7627 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7628 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7629 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7630 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7631 {
7632 struct if_proto *proto;
7633 errno_t result = 0;
7634
7635 if ((ifp->if_flags & IFF_NOARP) != 0) {
7636 result = ENOTSUP;
7637 goto done;
7638 }
7639
7640 /* callee holds a proto refcnt upon success */
7641 ifnet_lock_shared(ifp);
7642 proto = find_attached_proto(ifp, target_proto->sa_family);
7643 ifnet_lock_done(ifp);
7644 if (proto == NULL) {
7645 result = ENOTSUP;
7646 } else {
7647 proto_media_send_arp arpp;
7648 arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7649 proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7650 if (arpp == NULL) {
7651 result = ENOTSUP;
7652 } else {
7653 switch (arpop) {
7654 case ARPOP_REQUEST:
7655 arpstat.txrequests++;
7656 if (target_hw != NULL) {
7657 arpstat.txurequests++;
7658 }
7659 break;
7660 case ARPOP_REPLY:
7661 arpstat.txreplies++;
7662 break;
7663 }
7664 result = arpp(ifp, arpop, sender_hw, sender_proto,
7665 target_hw, target_proto);
7666 }
7667 if_proto_free(proto);
7668 }
7669 done:
7670 return result;
7671 }
7672
/*
 * Opaque tag type for thread-mark tokens.  Tokens are never
 * dereferenced: each one encodes a 32-bit mask of mark bits as a byte
 * offset from the address of net_thread_marks_base (see
 * net_thread_marks_push/pop below).
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* Token meaning "no bits to restore" (offset 0 from base) */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7678
/*
 * Set the requested mark bits on the current thread and return a token
 * recording which bits were newly set, so a matching
 * net_thread_marks_pop() clears only those.  The token is the address
 * of net_thread_marks_base offset by the newly-set mask; an offset of
 * zero means nothing needs undoing.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* Only bits not already marked are set (and later undone) */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	return (net_thread_marks_t)&base[pop];
}
7696
/*
 * Inverse of net_thread_marks_push(): clear the requested mark bits on
 * the current thread and return a token recording which bits were
 * actually cleared, so net_thread_unmarks_pop() can restore them.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* Only bits currently marked are cleared (and later restored) */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	return (net_thread_marks_t)&base[unpop];
}
7714
/*
 * Undo a prior net_thread_marks_push(): decode the bit mask from the
 * token's byte offset relative to net_thread_marks_base and clear
 * exactly those bits on the current thread.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* Token must fit in 32 bits and all its bits must be set */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7730
/*
 * Undo a prior net_thread_unmarks_push(): decode the bit mask from the
 * token's byte offset relative to net_thread_marks_base and set those
 * bits back on the current thread.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* Token must fit in 32 bits and none of its bits may be set */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= (u_int32_t)unpop;
	}
}
7746
7747 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7748 net_thread_is_marked(u_int32_t check)
7749 {
7750 if (check != 0) {
7751 struct uthread *uth = current_uthread();
7752 return uth->uu_network_marks & check;
7753 } else {
7754 return 0;
7755 }
7756 }
7757
7758 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7759 net_thread_is_unmarked(u_int32_t check)
7760 {
7761 if (check != 0) {
7762 struct uthread *uth = current_uthread();
7763 return ~uth->uu_network_marks & check;
7764 } else {
7765 return 0;
7766 }
7767 }
7768
7769 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)7770 _is_announcement(const struct sockaddr_in * sender_sin,
7771 const struct sockaddr_in * target_sin)
7772 {
7773 if (target_sin == NULL || sender_sin == NULL) {
7774 return FALSE;
7775 }
7776
7777 return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
7778 }
7779
/*
 * Send an ARP packet.  Normally delegates to dlil_send_arp_internal()
 * on the given interface; the special case is an ARP request for an
 * IPv4 link-local target (that is not a self-announcement), which is
 * broadcast on every IFEF_ARPLL-capable interface that has an IPv4
 * source address.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = __DECONST_SA(target_proto0);

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		SOCKADDR_COPY(target_proto, &target_proto_sinarp, sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = SA(&target_proto_sinarp);
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces. The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = SIN(sender_proto);
	target_sin = SIN(target_proto);
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t *__counted_by(count) ifp_list;
		u_int32_t count;
		u_int32_t ifp_on;

		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing. This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						SOCKADDR_COPY(SIN(source_ip->ifa_addr), &source_ip_copy, sizeof(source_ip_copy));
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/* Hold the link-layer address across the unlock */
				ifa_addref(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, SDL(source_hw->ifa_addr),
				    SA(&source_ip_copy), NULL,
				    target_proto);

				ifa_remref(source_hw);
				/* Keep the first non-ENOTSUP outcome */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free_counted_by(ifp_list, count);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7890
7891 /*
7892 * Caller must hold ifnet head lock.
7893 */
7894 static int
ifnet_lookup(struct ifnet * ifp)7895 ifnet_lookup(struct ifnet *ifp)
7896 {
7897 struct ifnet *_ifp;
7898
7899 LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7900 TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7901 if (_ifp == ifp) {
7902 break;
7903 }
7904 }
7905 return _ifp != NULL;
7906 }
7907
7908 /*
7909 * Caller has to pass a non-zero refio argument to get a
7910 * IO reference count. This will prevent ifnet_detach from
7911 * being called when there are outstanding io reference counts.
7912 */
int
ifnet_is_attached(struct ifnet *ifp, int refio)
{
	int ret;

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/*
	 * Test the attach state and, when fully attached and requested
	 * (refio > 0), take the IO reference under the same lock hold so
	 * a detach cannot slip in between the check and the increment.
	 */
	if ((ret = IF_FULLY_ATTACHED(ifp))) {
		if (refio > 0) {
			ifp->if_refio++;
		}
	}
	lck_mtx_unlock(&ifp->if_ref_lock);

	return ret;
}
7928
void
ifnet_incr_pending_thread_count(struct ifnet *ifp)
{
	/*
	 * Account for a kernel thread (input/start/poll; see ifnet_attach)
	 * that has been kicked off for this interface but has not finished
	 * starting up yet; protected by if_ref_lock.
	 */
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifp->if_threads_pending++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7936
void
ifnet_decr_pending_thread_count(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_threads_pending > 0);
	ifp->if_threads_pending--;
	/* Last pending thread wakes up any waiter sleeping on the counter. */
	if (ifp->if_threads_pending == 0) {
		wakeup(&ifp->if_threads_pending);
	}
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7948
7949 /*
7950 * Caller must ensure the interface is attached; the assumption is that
7951 * there is at least an outstanding IO reference count held already.
7952 * Most callers would call ifnet_is_{attached,data_ready}() instead.
7953 */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/*
	 * Per the contract in the block comment above, the caller must
	 * already hold an IO reference, so the interface must be fully
	 * attached and if_refio strictly positive here.
	 */
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7963
/*
 * Drop one IO reference with if_ref_lock already held; common tail
 * shared by ifnet_decr_iorefcnt(), ifnet_datamov_end() and
 * ifnet_datamov_resume().
 */
__attribute__((always_inline))
static void
ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_refio > 0);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));

	ifp->if_refio--;
	/* No data movers may remain once the last IO reference is gone. */
	VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);

	/*
	 * if there are no more outstanding io references, wakeup the
	 * ifnet_detach thread if detaching flag is set.
	 */
	if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
		wakeup(&(ifp->if_refio));
	}
}
7984
/* Lock-taking wrapper around ifnet_decr_iorefcnt_locked(). */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7992
7993 boolean_t
ifnet_datamov_begin(struct ifnet * ifp)7994 ifnet_datamov_begin(struct ifnet *ifp)
7995 {
7996 boolean_t ret;
7997
7998 lck_mtx_lock_spin(&ifp->if_ref_lock);
7999 if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8000 ifp->if_refio++;
8001 ifp->if_datamov++;
8002 }
8003 lck_mtx_unlock(&ifp->if_ref_lock);
8004
8005 DTRACE_IP2(datamov__begin, struct ifnet *, ifp, boolean_t, ret);
8006 return ret;
8007 }
8008
/*
 * End a data-movement section started by ifnet_datamov_begin(): drop
 * the datamov count and the IO reference taken there, waking drainers
 * blocked in ifnet_datamov_drain() when the last mover leaves.
 */
void
ifnet_datamov_end(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_datamov > 0);
	/*
	 * if there's no more thread moving data, wakeup any
	 * drainers that's blocked waiting for this.
	 */
	if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
		DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
		DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
		wakeup(&(ifp->if_datamov));
	}
	/* Release the IO reference while still holding if_ref_lock. */
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);

	DTRACE_IP1(datamov__end, struct ifnet *, ifp);
}
8028
static void
ifnet_datamov_suspend_locked(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Each suspension pins the interface with its own IO reference;
	 * it is released by ifnet_datamov_resume().
	 */
	ifp->if_refio++;
	/*
	 * Suspensions nest via if_suspend; only the first one clears
	 * IFRF_READY, which makes ifnet_datamov_begin() start failing.
	 */
	if (ifp->if_suspend++ == 0) {
		VERIFY(ifp->if_refflags & IFRF_READY);
		ifp->if_refflags &= ~IFRF_READY;
	}
}
8039
/* Unconditionally suspend data movement (nests); pair with resume. */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8048
8049 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8050 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8051 {
8052 lck_mtx_lock_spin(&ifp->if_ref_lock);
8053 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8054 if (ifp->if_suspend > 0) {
8055 lck_mtx_unlock(&ifp->if_ref_lock);
8056 return FALSE;
8057 }
8058 ifnet_datamov_suspend_locked(ifp);
8059 lck_mtx_unlock(&ifp->if_ref_lock);
8060 return TRUE;
8061 }
8062
/*
 * Block until every in-flight data mover has called ifnet_datamov_end(),
 * then flush the transmit queues.  Data movement must already have been
 * suspended (if_suspend > 0, IFRF_READY clear) so no new movers can enter
 * while we wait.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* Sleep until the last mover's ifnet_datamov_end() wakes us. */
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8090
/* Convenience: suspend data movement, then wait for movers to drain. */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8097
/*
 * Undo one ifnet_datamov_suspend(): unwind a nesting level, restore
 * IFRF_READY when the last suspension lifts, and release the IO
 * reference taken by ifnet_datamov_suspend_locked().
 */
void
ifnet_datamov_resume(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	if (--ifp->if_suspend == 0) {
		VERIFY(!(ifp->if_refflags & IFRF_READY));
		ifp->if_refflags |= IFRF_READY;
	}
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8111
8112 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8113 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8114 {
8115 struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8116 ctrace_t *tr;
8117 u_int32_t idx;
8118 u_int16_t *cnt;
8119
8120 if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8121 panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8122 /* NOTREACHED */
8123 }
8124
8125 if (refhold) {
8126 cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8127 tr = dl_if_dbg->dldbg_if_refhold;
8128 } else {
8129 cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8130 tr = dl_if_dbg->dldbg_if_refrele;
8131 }
8132
8133 idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8134 ctrace_record(&tr[idx]);
8135 }
8136
/*
 * Take a reference on the dlil_ifnet underlying this ifnet (the cast
 * relies on the ifnet being embedded at the head of struct dlil_ifnet
 * — see its definition).  Returns EINVAL for a NULL ifp, 0 otherwise.
 */
errno_t
dlil_if_ref(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	++dl_if->dl_if_refcnt;
	/* A wrap to zero means the counter overflowed: fatal. */
	if (dl_if->dl_if_refcnt == 0) {
		panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
	}
	/* Optional debug hook (see dlil_if_trace). */
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, TRUE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);

	return 0;
}
8159
/*
 * Drop a reference taken by dlil_if_ref().  When the last reference on
 * a still-embryonic interface goes away, release the dlil_ifnet itself
 * via _dlil_if_release() — outside dl_if_lock to avoid holding a spin
 * lock across the release.  Returns EINVAL for a NULL ifp, 0 otherwise.
 */
errno_t
dlil_if_free(struct ifnet *ifp)
{
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
	bool need_release = FALSE;

	if (dl_if == NULL) {
		return EINVAL;
	}

	lck_mtx_lock_spin(&dl_if->dl_if_lock);
	switch (dl_if->dl_if_refcnt) {
	case 0:
		/* More frees than refs: fatal underflow. */
		panic("%s: negative refcnt for ifp=%p", __func__, ifp);
		/* NOTREACHED */
		break;
	case 1:
		/* Dropping the final ref on a never-attached interface. */
		if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
			need_release = TRUE;
		}
		break;
	default:
		break;
	}
	--dl_if->dl_if_refcnt;
	/* Optional debug hook (see dlil_if_trace). */
	if (dl_if->dl_if_trace != NULL) {
		(*dl_if->dl_if_trace)(dl_if, FALSE);
	}
	lck_mtx_unlock(&dl_if->dl_if_lock);
	if (need_release) {
		_dlil_if_release(ifp, true);
	}
	return 0;
}
8194
/*
 * Attach a fully-initialized if_proto to its interface: take an IO
 * reference, reject duplicates, let the family module refine the demux
 * descriptors, append the proto to the interface's protocol hash chain
 * and post KEV_DL_PROTO_ATTACHED.  Returns 0 on success, EINVAL (vmnet
 * and not PF_BRIDGE), ENXIO (interface no longer attached), EEXIST
 * (family already attached), or the family module's error.  On success
 * *proto_count (if non-NULL) receives the number of attached protocols.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* Holds an IO ref on success; released at ioref_done below. */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		/* Already attached; drop the lookup's proto ref and bail. */
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash
	 */
	/* Walk to the tail of the chain so the new proto is appended. */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	/* Broadcast the attach event outside the ifnet lock. */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	/* Release the IO ref taken by ifnet_is_attached() above. */
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8274
/*
 * Post-attach housekeeping common to v1/v2 protocol attach: mark the
 * interface up, push the flag change to the driver, publish the
 * SIOCSIFFLAGS event, and (Skywalk only) attach the flowswitch nexus
 * when an IP protocol was attached.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8298
/*
 * Public KPI: attach a v1 protocol to an interface.  Validates the
 * arguments, confirms the interface is on the global list (under the
 * shared head lock, held across the attach), copies the caller's v1
 * callbacks into a freshly zalloc'ed if_proto and hands it to
 * dlil_attach_protocol().  On any failure the if_proto is freed here;
 * on success the interface is additionally marked up via
 * dlil_handle_proto_attach().
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* Success: bring the interface up (drops no locks here). */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* Failure: the if_proto never got attached; reclaim it. */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8360
/*
 * Public KPI: attach a v2 protocol to an interface.  Identical flow to
 * ifnet_attach_protocol() except the caller's callbacks are v2-shaped
 * (header-less input); see that function for the locking and cleanup
 * behavior.
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		/* Success: bring the interface up. */
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		/* Failure: the if_proto never got attached; reclaim it. */
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8422
/*
 * Public KPI: detach a protocol family from an interface.  Notifies the
 * family module, unlinks the if_proto from the hash, swaps its callbacks
 * for the ifproto_media_* stubs (so racing callers get ENXIO instead of
 * calling into freed client code), then drops both outstanding proto
 * references (attach + lookup).  Returns 0, EINVAL on bad arguments, or
 * ENXIO when the family was not attached.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/* Neutralize the callbacks with media stubs that return ENXIO. */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8488
/*
 * Stub v1 input callback installed by ifnet_detach_protocol() on a
 * detached if_proto; always rejects the packet with ENXIO.
 */
static errno_t
ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet, char *header)
{
#pragma unused(ifp, protocol, packet, header)
	return ENXIO;
}
8496
/*
 * Stub v2 input callback installed by ifnet_detach_protocol() on a
 * detached if_proto; always rejects the packet with ENXIO.
 */
static errno_t
ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
    struct mbuf *packet)
{
#pragma unused(ifp, protocol, packet)
	return ENXIO;
}
8504
/*
 * Stub pre_output callback installed by ifnet_detach_protocol() on a
 * detached if_proto; always fails with ENXIO.
 */
static errno_t
ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
    mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
    char *link_layer_dest)
{
#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
	return ENXIO;
}
8513
/*
 * Stub event callback installed by ifnet_detach_protocol() on a
 * detached if_proto; silently ignores the event.
 */
static void
ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
    const struct kev_msg *event)
{
#pragma unused(ifp, protocol, event)
}
8520
/*
 * Stub ioctl callback installed by ifnet_detach_protocol() on a
 * detached if_proto; always fails with ENXIO.
 */
static errno_t
ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
    unsigned long command, void *argument)
{
#pragma unused(ifp, protocol, command, argument)
	return ENXIO;
}
8528
/*
 * Stub resolve_multi callback installed by ifnet_detach_protocol() on
 * a detached if_proto; always fails with ENXIO.
 */
static errno_t
ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
    struct sockaddr_dl *out_ll, size_t ll_len)
{
#pragma unused(ifp, proto_addr, out_ll, ll_len)
	return ENXIO;
}
8536
/*
 * Stub send_arp callback installed by ifnet_detach_protocol() on a
 * detached if_proto; always fails with ENXIO.
 */
static errno_t
ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
    const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
    const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
{
#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
	return ENXIO;
}
8545
8546 extern int if_next_index(void);
8547 extern int tcp_ecn_outbound;
8548
/*
 * Initialize an interface's transmit classq: derive the packet
 * scheduler flags from global config (flow advisory, delay-based
 * queueing) and the interface's output scheduling model, inherit the
 * drop limit from the default send queue for secondary queues, and set
 * the queue up.  A setup failure is fatal (panics).
 */
void
dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
{
	uint32_t sflags = 0;
	int err;

	if (if_flowadv) {
		sflags |= PKTSCHEDF_QALG_FLOWCTL;
	}

	if (if_delaybased_queue) {
		sflags |= PKTSCHEDF_QALG_DELAYBASED;
	}

	if (ifp->if_output_sched_model ==
	    IFNET_SCHED_MODEL_DRIVER_MANAGED) {
		sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
	}
	/* Inherit drop limit from the default queue */
	if (ifp->if_snd != ifcq) {
		IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
	}
	/* Initialize transmit queue(s) */
	err = ifclassq_setup(ifcq, ifp, sflags);
	if (err != 0) {
		panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
		    "err=%d", __func__, ifp, err);
		/* NOTREACHED */
	}
}
8579
8580 errno_t
ifnet_attach(ifnet_t ifp,const struct sockaddr_dl * ll_addr)8581 ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
8582 {
8583 #if SKYWALK
8584 boolean_t netif_compat;
8585 if_nexus_netif nexus_netif;
8586 #endif /* SKYWALK */
8587 struct ifnet *tmp_if;
8588 struct ifaddr *ifa;
8589 struct if_data_internal if_data_saved;
8590 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8591 struct dlil_threading_info *dl_inp;
8592 thread_continue_t thfunc = NULL;
8593 int err;
8594
8595 if (ifp == NULL) {
8596 return EINVAL;
8597 }
8598
8599 /*
8600 * Serialize ifnet attach using dlil_ifnet_lock, in order to
8601 * prevent the interface from being configured while it is
8602 * embryonic, as ifnet_head_lock is dropped and reacquired
8603 * below prior to marking the ifnet with IFRF_ATTACHED.
8604 */
8605 dlil_if_lock();
8606 ifnet_head_lock_exclusive();
8607 /* Verify we aren't already on the list */
8608 TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
8609 if (tmp_if == ifp) {
8610 ifnet_head_done();
8611 dlil_if_unlock();
8612 return EEXIST;
8613 }
8614 }
8615
8616 lck_mtx_lock_spin(&ifp->if_ref_lock);
8617 if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
8618 panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
8619 __func__, ifp);
8620 /* NOTREACHED */
8621 }
8622 lck_mtx_unlock(&ifp->if_ref_lock);
8623
8624 ifnet_lock_exclusive(ifp);
8625
8626 /* Sanity check */
8627 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
8628 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
8629 VERIFY(ifp->if_threads_pending == 0);
8630
8631 if (ll_addr != NULL) {
8632 if (ifp->if_addrlen == 0) {
8633 ifp->if_addrlen = ll_addr->sdl_alen;
8634 } else if (ll_addr->sdl_alen != ifp->if_addrlen) {
8635 ifnet_lock_done(ifp);
8636 ifnet_head_done();
8637 dlil_if_unlock();
8638 return EINVAL;
8639 }
8640 }
8641
8642 /*
8643 * Allow interfaces without protocol families to attach
8644 * only if they have the necessary fields filled out.
8645 */
8646 if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
8647 DLIL_PRINTF("%s: Attempt to attach interface without "
8648 "family module - %d\n", __func__, ifp->if_family);
8649 ifnet_lock_done(ifp);
8650 ifnet_head_done();
8651 dlil_if_unlock();
8652 return ENODEV;
8653 }
8654
8655 /* Allocate protocol hash table */
8656 VERIFY(ifp->if_proto_hash == NULL);
8657 ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
8658 PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8659
8660 lck_mtx_lock_spin(&ifp->if_flt_lock);
8661 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
8662 TAILQ_INIT(&ifp->if_flt_head);
8663 VERIFY(ifp->if_flt_busy == 0);
8664 VERIFY(ifp->if_flt_waiters == 0);
8665 VERIFY(ifp->if_flt_non_os_count == 0);
8666 VERIFY(ifp->if_flt_no_tso_count == 0);
8667 lck_mtx_unlock(&ifp->if_flt_lock);
8668
8669 if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
8670 VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
8671 LIST_INIT(&ifp->if_multiaddrs);
8672 }
8673
8674 VERIFY(ifp->if_allhostsinm == NULL);
8675 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
8676 TAILQ_INIT(&ifp->if_addrhead);
8677
8678 if (ifp->if_index == 0) {
8679 int idx = if_next_index();
8680
8681 /*
8682 * Since we exhausted the list of
8683 * if_index's, try to find an empty slot
8684 * in ifindex2ifnet.
8685 */
8686 if (idx == -1 && if_index >= UINT16_MAX) {
8687 for (int i = 1; i < if_index; i++) {
8688 if (ifindex2ifnet[i] == NULL &&
8689 ifnet_addrs[i - 1] == NULL) {
8690 idx = i;
8691 break;
8692 }
8693 }
8694 }
8695 if (idx == -1) {
8696 ifp->if_index = 0;
8697 ifnet_lock_done(ifp);
8698 ifnet_head_done();
8699 dlil_if_unlock();
8700 return ENOBUFS;
8701 }
8702 ifp->if_index = (uint16_t)idx;
8703
8704 /* the lladdr passed at attach time is the permanent address */
8705 if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
8706 ll_addr->sdl_alen == ETHER_ADDR_LEN) {
8707 bcopy(CONST_LLADDR(ll_addr),
8708 dl_if->dl_if_permanent_ether,
8709 ETHER_ADDR_LEN);
8710 dl_if->dl_if_permanent_ether_is_set = 1;
8711 }
8712 }
8713 /* There should not be anything occupying this slot */
8714 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
8715
8716 /* allocate (if needed) and initialize a link address */
8717 ifa = dlil_alloc_lladdr(ifp, ll_addr);
8718 if (ifa == NULL) {
8719 ifnet_lock_done(ifp);
8720 ifnet_head_done();
8721 dlil_if_unlock();
8722 return ENOBUFS;
8723 }
8724
8725 VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
8726 ifnet_addrs[ifp->if_index - 1] = ifa;
8727
8728 /* make this address the first on the list */
8729 IFA_LOCK(ifa);
8730 /* hold a reference for ifnet_addrs[] */
8731 ifa_addref(ifa);
8732 /* if_attach_link_ifa() holds a reference for ifa_link */
8733 if_attach_link_ifa(ifp, ifa);
8734 IFA_UNLOCK(ifa);
8735
8736 TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
8737 ifindex2ifnet[ifp->if_index] = ifp;
8738
8739 /* Hold a reference to the underlying dlil_ifnet */
8740 ifnet_reference(ifp);
8741
8742 /* Clear stats (save and restore other fields that we care) */
8743 if_data_saved = ifp->if_data;
8744 bzero(&ifp->if_data, sizeof(ifp->if_data));
8745 ifp->if_data.ifi_type = if_data_saved.ifi_type;
8746 ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
8747 ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
8748 ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
8749 ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
8750 ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
8751 ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
8752 ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
8753 ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
8754 ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
8755 ifnet_touch_lastchange(ifp);
8756
8757 VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
8758 ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
8759 ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);
8760
8761 dlil_ifclassq_setup(ifp, ifp->if_snd);
8762
8763 /* Sanity checks on the input thread storage */
8764 dl_inp = &dl_if->dl_if_inpstorage;
8765 bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
8766 VERIFY(dl_inp->dlth_flags == 0);
8767 VERIFY(dl_inp->dlth_wtot == 0);
8768 VERIFY(dl_inp->dlth_ifp == NULL);
8769 VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
8770 VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
8771 VERIFY(!dl_inp->dlth_affinity);
8772 VERIFY(ifp->if_inp == NULL);
8773 VERIFY(dl_inp->dlth_thread == THREAD_NULL);
8774 VERIFY(dl_inp->dlth_strategy == NULL);
8775 VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
8776 VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
8777 VERIFY(dl_inp->dlth_affinity_tag == 0);
8778
8779 #if IFNET_INPUT_SANITY_CHK
8780 VERIFY(dl_inp->dlth_pkts_cnt == 0);
8781 #endif /* IFNET_INPUT_SANITY_CHK */
8782
8783 VERIFY(ifp->if_poll_thread == THREAD_NULL);
8784 dlil_reset_rxpoll_params(ifp);
8785 /*
8786 * A specific DLIL input thread is created per non-loopback interface.
8787 */
8788 if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
8789 ifp->if_inp = dl_inp;
8790 ifnet_incr_pending_thread_count(ifp);
8791 err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
8792 if (err == ENODEV) {
8793 VERIFY(thfunc == NULL);
8794 ifnet_decr_pending_thread_count(ifp);
8795 } else if (err != 0) {
8796 panic_plain("%s: ifp=%p couldn't get an input thread; "
8797 "err=%d", __func__, ifp, err);
8798 /* NOTREACHED */
8799 }
8800 }
8801 /*
8802 * If the driver supports the new transmit model, calculate flow hash
8803 * and create a workloop starter thread to invoke the if_start callback
8804 * where the packets may be dequeued and transmitted.
8805 */
8806 if (ifp->if_eflags & IFEF_TXSTART) {
8807 thread_precedence_policy_data_t info;
8808 __unused kern_return_t kret;
8809
8810 ifp->if_flowhash = ifnet_calc_flowhash(ifp);
8811 VERIFY(ifp->if_flowhash != 0);
8812 VERIFY(ifp->if_start_thread == THREAD_NULL);
8813
8814 ifnet_set_start_cycle(ifp, NULL);
8815 ifp->if_start_active = 0;
8816 ifp->if_start_req = 0;
8817 ifp->if_start_flags = 0;
8818 VERIFY(ifp->if_start != NULL);
8819 ifnet_incr_pending_thread_count(ifp);
8820 if ((err = kernel_thread_start(ifnet_start_thread_func,
8821 ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
8822 panic_plain("%s: "
8823 "ifp=%p couldn't get a start thread; "
8824 "err=%d", __func__, ifp, err);
8825 /* NOTREACHED */
8826 }
8827 bzero(&info, sizeof(info));
8828 info.importance = 1;
8829 kret = thread_policy_set(ifp->if_start_thread,
8830 THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8831 THREAD_PRECEDENCE_POLICY_COUNT);
8832 ASSERT(kret == KERN_SUCCESS);
8833 } else {
8834 ifp->if_flowhash = 0;
8835 }
8836
8837 /* Reset polling parameters */
8838 ifnet_set_poll_cycle(ifp, NULL);
8839 ifp->if_poll_update = 0;
8840 ifp->if_poll_flags = 0;
8841 ifp->if_poll_req = 0;
8842 VERIFY(ifp->if_poll_thread == THREAD_NULL);
8843
8844 /*
8845 * If the driver supports the new receive model, create a poller
8846 * thread to invoke if_input_poll callback where the packets may
8847 * be dequeued from the driver and processed for reception.
8848 * if the interface is netif compat then the poller thread is
8849 * managed by netif.
8850 */
8851 if (thfunc == dlil_rxpoll_input_thread_func) {
8852 thread_precedence_policy_data_t info;
8853 __unused kern_return_t kret;
8854 #if SKYWALK
8855 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
8856 #endif /* SKYWALK */
8857 VERIFY(ifp->if_input_poll != NULL);
8858 VERIFY(ifp->if_input_ctl != NULL);
8859 ifnet_incr_pending_thread_count(ifp);
8860 if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
8861 &ifp->if_poll_thread)) != KERN_SUCCESS) {
8862 panic_plain("%s: ifp=%p couldn't get a poll thread; "
8863 "err=%d", __func__, ifp, err);
8864 /* NOTREACHED */
8865 }
8866 bzero(&info, sizeof(info));
8867 info.importance = 1;
8868 kret = thread_policy_set(ifp->if_poll_thread,
8869 THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8870 THREAD_PRECEDENCE_POLICY_COUNT);
8871 ASSERT(kret == KERN_SUCCESS);
8872 }
8873
8874 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
8875 VERIFY(ifp->if_desc.ifd_len == 0);
8876 VERIFY(ifp->if_desc.ifd_desc != NULL);
8877
8878 /* Record attach PC stacktrace */
8879 ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
8880
8881 ifp->if_updatemcasts = 0;
8882 if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
8883 struct ifmultiaddr *ifma;
8884 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
8885 IFMA_LOCK(ifma);
8886 if (ifma->ifma_addr->sa_family == AF_LINK ||
8887 ifma->ifma_addr->sa_family == AF_UNSPEC) {
8888 ifp->if_updatemcasts++;
8889 }
8890 IFMA_UNLOCK(ifma);
8891 }
8892
8893 DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
8894 "membership(s)\n", if_name(ifp),
8895 ifp->if_updatemcasts);
8896 }
8897
8898 /* Clear logging parameters */
8899 bzero(&ifp->if_log, sizeof(ifp->if_log));
8900
8901 /* Clear foreground/realtime activity timestamps */
8902 ifp->if_fg_sendts = 0;
8903 ifp->if_rt_sendts = 0;
8904
8905 /* Clear throughput estimates and radio type */
8906 ifp->if_estimated_up_bucket = 0;
8907 ifp->if_estimated_down_bucket = 0;
8908 ifp->if_radio_type = 0;
8909 ifp->if_radio_channel = 0;
8910
8911 VERIFY(ifp->if_delegated.ifp == NULL);
8912 VERIFY(ifp->if_delegated.type == 0);
8913 VERIFY(ifp->if_delegated.family == 0);
8914 VERIFY(ifp->if_delegated.subfamily == 0);
8915 VERIFY(ifp->if_delegated.expensive == 0);
8916 VERIFY(ifp->if_delegated.constrained == 0);
8917 VERIFY(ifp->if_delegated.ultra_constrained == 0);
8918
8919 VERIFY(ifp->if_agentids == NULL);
8920 VERIFY(ifp->if_agentcount == 0);
8921
8922 /* Reset interface state */
8923 bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
8924 ifp->if_interface_state.valid_bitmask |=
8925 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
8926 ifp->if_interface_state.interface_availability =
8927 IF_INTERFACE_STATE_INTERFACE_AVAILABLE;
8928
8929 /* Initialize Link Quality Metric (loopback [lo0] is always good) */
8930 if (ifp == lo_ifp) {
8931 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
8932 ifp->if_interface_state.valid_bitmask |=
8933 IF_INTERFACE_STATE_LQM_STATE_VALID;
8934 } else {
8935 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
8936 }
8937
8938 /*
8939 * Enable ECN capability on this interface depending on the
8940 * value of ECN global setting
8941 */
8942 if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
8943 if_set_eflags(ifp, IFEF_ECN_ENABLE);
8944 if_clear_eflags(ifp, IFEF_ECN_DISABLE);
8945 }
8946
8947 /*
8948 * Built-in Cyclops always on policy for WiFi infra
8949 */
8950 if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
8951 errno_t error;
8952
8953 error = if_set_qosmarking_mode(ifp,
8954 IFRTYPE_QOSMARKING_FASTLANE);
8955 if (error != 0) {
8956 DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
8957 __func__, ifp->if_xname, error);
8958 } else {
8959 if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
8960 #if (DEVELOPMENT || DEBUG)
8961 DLIL_PRINTF("%s fastlane enabled on %s\n",
8962 __func__, ifp->if_xname);
8963 #endif /* (DEVELOPMENT || DEBUG) */
8964 }
8965 }
8966
8967 ifnet_lock_done(ifp);
8968 ifnet_head_done();
8969
8970 #if SKYWALK
8971 netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
8972 #endif /* SKYWALK */
8973
8974 lck_mtx_lock(&ifp->if_cached_route_lock);
8975 /* Enable forwarding cached route */
8976 ifp->if_fwd_cacheok = 1;
8977 /* Clean up any existing cached routes */
8978 ROUTE_RELEASE(&ifp->if_fwd_route);
8979 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
8980 ROUTE_RELEASE(&ifp->if_src_route);
8981 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
8982 ROUTE_RELEASE(&ifp->if_src_route6);
8983 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
8984 lck_mtx_unlock(&ifp->if_cached_route_lock);
8985
8986 ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
8987
8988 /*
8989 * Allocate and attach IGMPv3/MLDv2 interface specific variables
8990 * and trees; do this before the ifnet is marked as attached.
8991 * The ifnet keeps the reference to the info structures even after
8992 * the ifnet is detached, since the network-layer records still
8993 * refer to the info structures even after that. This also
8994 * makes it possible for them to still function after the ifnet
8995 * is recycled or reattached.
8996 */
8997 #if INET
8998 if (IGMP_IFINFO(ifp) == NULL) {
8999 IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
9000 VERIFY(IGMP_IFINFO(ifp) != NULL);
9001 } else {
9002 VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
9003 igmp_domifreattach(IGMP_IFINFO(ifp));
9004 }
9005 #endif /* INET */
9006 if (MLD_IFINFO(ifp) == NULL) {
9007 MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
9008 VERIFY(MLD_IFINFO(ifp) != NULL);
9009 } else {
9010 VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
9011 mld_domifreattach(MLD_IFINFO(ifp));
9012 }
9013
9014 VERIFY(ifp->if_data_threshold == 0);
9015 VERIFY(ifp->if_dt_tcall != NULL);
9016
9017 /*
9018 * Wait for the created kernel threads for I/O to get
9019 * scheduled and run at least once before we proceed
9020 * to mark interface as attached.
9021 */
9022 lck_mtx_lock(&ifp->if_ref_lock);
9023 while (ifp->if_threads_pending != 0) {
9024 DLIL_PRINTF("%s: Waiting for all kernel threads created for "
9025 "interface %s to get scheduled at least once.\n",
9026 __func__, ifp->if_xname);
9027 (void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
9028 __func__, NULL);
9029 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
9030 }
9031 lck_mtx_unlock(&ifp->if_ref_lock);
9032 DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
9033 "at least once. Proceeding.\n", __func__, ifp->if_xname);
9034
9035 /* Final mark this ifnet as attached. */
9036 ifnet_lock_exclusive(ifp);
9037 lck_mtx_lock_spin(&ifp->if_ref_lock);
9038 ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
9039 lck_mtx_unlock(&ifp->if_ref_lock);
9040 if (net_rtref) {
9041 /* boot-args override; enable idle notification */
9042 (void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
9043 IFRF_IDLE_NOTIFY);
9044 } else {
9045 /* apply previous request(s) to set the idle flags, if any */
9046 (void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
9047 ifp->if_idle_new_flags_mask);
9048 }
9049 #if SKYWALK
9050 /* the interface is fully attached; let the nexus adapter know */
9051 if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
9052 if (netif_compat) {
9053 if (sk_netif_compat_txmodel ==
9054 NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
9055 ifnet_enqueue_multi_setup(ifp,
9056 sk_tx_delay_qlen, sk_tx_delay_timeout);
9057 }
9058 ifp->if_nx_netif = nexus_netif;
9059 }
9060 ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
9061 }
9062 #endif /* SKYWALK */
9063 ifnet_lock_done(ifp);
9064 dlil_if_unlock();
9065
9066 #if PF
9067 /*
9068 * Attach packet filter to this interface, if enabled.
9069 */
9070 pf_ifnet_hook(ifp, 1);
9071 #endif /* PF */
9072
9073 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);
9074
9075 if (dlil_verbose) {
9076 DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
9077 (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
9078 }
9079
9080 return 0;
9081 }
9082
9083 /*
 * Prepare the storage for the first/permanent link address, which
 * must have the same lifetime as the ifnet itself. Although the link
9086 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9087 * its location in memory must never change as it may still be referred
9088 * to by some parts of the system afterwards (unfortunate implementation
9089 * artifacts inherited from BSD.)
9090 *
9091 * Caller must hold ifnet lock as writer.
9092 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa = NULL;
	struct sockaddr_dl *addr_sdl, *mask_sdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * The AF_LINK sockaddr stores the interface name in sdl_data,
	 * immediately followed by the link-layer address bytes.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	/* mask covers the fixed sockaddr_dl header plus the name bytes */
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);	/* round up to a 32-bit multiple */
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		struct dl_if_lladdr_xtra_space *__single dl_if_lladdr_ext;

		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			/* first time here: set up the extended storage */
			dl_if_lladdr_ext = zalloc_permanent(
				sizeof(*dl_if_lladdr_ext), ZALIGN(struct ifaddr));

			ifa = &dl_if_lladdr_ext->ifa;
			ifa_lock_init(ifa);
			ifa_initref(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		} else {
			/* reuse the previously-allocated extended storage */
			dl_if_lladdr_ext = __unsafe_forge_single(
				struct dl_if_lladdr_xtra_space*, ifa);
			ifa = &dl_if_lladdr_ext->ifa;
		}

		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		bzero(dl_if_lladdr_ext->addr_sdl_bytes,
		    sizeof(dl_if_lladdr_ext->addr_sdl_bytes));
		bzero(dl_if_lladdr_ext->mask_sdl_bytes,
		    sizeof(dl_if_lladdr_ext->mask_sdl_bytes));
		addr_sdl = SDL(dl_if_lladdr_ext->addr_sdl_bytes);
		mask_sdl = SDL(dl_if_lladdr_ext->mask_sdl_bytes);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure. This is the most common case.
		 */
		if (ifa == NULL) {
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			ifa_initref(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		bzero(dl_if->dl_if_lladdr.addr_sdl_bytes,
		    sizeof(dl_if->dl_if_lladdr.addr_sdl_bytes));
		bzero(dl_if->dl_if_lladdr.mask_sdl_bytes,
		    sizeof(dl_if->dl_if_lladdr.mask_sdl_bytes));
		addr_sdl = SDL(dl_if->dl_if_lladdr.addr_sdl_bytes);
		mask_sdl = SDL(dl_if->dl_if_lladdr.mask_sdl_bytes);
	}

	/* Swap in the (possibly new) ifaddr; old one is released below */
	if (ifp->if_lladdr != ifa) {
		oifa = ifp->if_lladdr;
		ifp->if_lladdr = ifa;
	}

	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = SA(addr_sdl);
	addr_sdl->sdl_len = (u_char)socksize;
	addr_sdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		/* interface name occupies the front of sdl_data */
		bcopy(workbuf, addr_sdl->sdl_data, min(namelen,
		    sizeof(addr_sdl->sdl_data)));
		addr_sdl->sdl_nlen = (u_char)namelen;
	} else {
		addr_sdl->sdl_nlen = 0;
	}
	addr_sdl->sdl_index = ifp->if_index;
	addr_sdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		/* link-layer address bytes follow the name in sdl_data */
		addr_sdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(addr_sdl), addr_sdl->sdl_alen);
	} else {
		addr_sdl->sdl_alen = 0;
	}
	ifa->ifa_netmask = SA(mask_sdl);
	mask_sdl->sdl_len = (u_char)masklen;
	/* netmask: all-ones over the name portion of sdl_data */
	while (namelen > 0) {
		mask_sdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	if (oifa != NULL) {
		ifa_remref(oifa);
	}

	return ifa;
}
9214
/*
 * Tell the upper network layers to drop all of their addresses
 * configured on this interface: IPv4 (when compiled in) and IPv6.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9223
/*
 * Begin detaching an interface: mark it down, remove it from the
 * global ifnet lists (so it is no longer visible to lookups), reset
 * various per-interface state, and hand the remainder of the teardown
 * to the detacher worker thread (which calls ifnet_detach_final()).
 *
 * Returns 0 on success; EINVAL if ifp is NULL or was never attached;
 * ENXIO if a detach is already in progress.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Invalidate the neighbor-discovery CGA state for this interface */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	/* Tear down any network emulation state attached to output */
	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		/* Interface is not attached; nothing to detach */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	/*
	 * Increment the generation count on interface deletion
	 */
	ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9418
/*
 * Queue ifp on the detaching list and wake up the detacher thread,
 * which performs the final detach work asynchronously.  Caller must
 * hold the dlil_if lock.
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);	/* catch counter wraparound */
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	/* rouse the detacher thread sleeping on ifnet_delayed_run */
	wakeup((caddr_t)&ifnet_delayed_run);
}
9429
9430 static struct ifnet *
ifnet_detaching_dequeue(void)9431 ifnet_detaching_dequeue(void)
9432 {
9433 struct ifnet *ifp;
9434
9435 dlil_if_lock_assert();
9436
9437 ifp = TAILQ_FIRST(&ifnet_detaching_head);
9438 VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9439 if (ifp != NULL) {
9440 VERIFY(ifnet_detaching_cnt != 0);
9441 --ifnet_detaching_cnt;
9442 TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9443 ifp->if_detaching_link.tqe_next = NULL;
9444 ifp->if_detaching_link.tqe_prev = NULL;
9445 }
9446 return ifp;
9447 }
9448
/*
 * Continuation body for the interface detacher thread: drains the
 * detaching list by calling ifnet_detach_final() on each dequeued
 * interface, then blocks on ifnet_delayed_run with itself as the
 * continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		/* first run after thread creation; see thread_func below */
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock across the blocking final detach */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* list drained; sleep until ifnet_detaching_enqueue() wakes us */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0); /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9491
/*
 * Entry point for the interface detacher thread.  Arms the wait on
 * ifnet_delayed_run, marks itself embryonic, and issues a self-wakeup
 * so the continuation runs once right away (clearing the embryonic
 * flag and decrementing the pending-thread count).  All further work
 * happens in ifnet_detacher_thread_cont(); never returns.
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9508
9509 static void
ifnet_detach_final(struct ifnet * ifp)9510 ifnet_detach_final(struct ifnet *ifp)
9511 {
9512 struct ifnet_filter *filter, *filter_next;
9513 struct dlil_ifnet *dlifp;
9514 struct ifnet_filter_head fhead;
9515 struct dlil_threading_info *inp;
9516 struct ifaddr *ifa;
9517 ifnet_detached_func if_free;
9518 int i;
9519 bool waited = false;
9520
9521 /* Let BPF know we're detaching */
9522 bpfdetach(ifp);
9523
9524 #if SKYWALK
9525 dlil_netif_detach_notify(ifp);
9526 /*
9527 * Wait for the datapath to quiesce before tearing down
9528 * netif/flowswitch nexuses.
9529 */
9530 dlil_quiesce_and_detach_nexuses(ifp);
9531 #endif /* SKYWALK */
9532
9533 lck_mtx_lock(&ifp->if_ref_lock);
9534 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9535 panic("%s: flags mismatch (detaching not set) ifp=%p",
9536 __func__, ifp);
9537 /* NOTREACHED */
9538 }
9539
9540 /*
9541 * Wait until the existing IO references get released
9542 * before we proceed with ifnet_detach. This is not a
9543 * common case, so block without using a continuation.
9544 */
9545 while (ifp->if_refio > 0) {
9546 waited = true;
9547 DLIL_PRINTF("%s: %s waiting for IO references to drain\n",
9548 __func__, if_name(ifp));
9549 (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9550 (PZERO - 1), "ifnet_ioref_wait", NULL);
9551 }
9552 if (waited) {
9553 DLIL_PRINTF("%s: %s IO references drained\n",
9554 __func__, if_name(ifp));
9555 }
9556 VERIFY(ifp->if_datamov == 0);
9557 VERIFY(ifp->if_drainers == 0);
9558 VERIFY(ifp->if_suspend == 0);
9559 ifp->if_refflags &= ~IFRF_READY;
9560 lck_mtx_unlock(&ifp->if_ref_lock);
9561
9562 /* Clear agent IDs */
9563 if (ifp->if_agentids != NULL) {
9564 kfree_data(ifp->if_agentids,
9565 sizeof(uuid_t) * ifp->if_agentcount);
9566 ifp->if_agentids = NULL;
9567 }
9568 ifp->if_agentcount = 0;
9569
9570 #if SKYWALK
9571 VERIFY(LIST_EMPTY(&ifp->if_netns_tokens));
9572 #endif /* SKYWALK */
9573 /* Drain and destroy send queue */
9574 ifclassq_teardown(ifp->if_snd);
9575
9576 /* Detach interface filters */
9577 lck_mtx_lock(&ifp->if_flt_lock);
9578 if_flt_monitor_enter(ifp);
9579
9580 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9581 fhead = ifp->if_flt_head;
9582 TAILQ_INIT(&ifp->if_flt_head);
9583
9584 for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9585 filter_next = TAILQ_NEXT(filter, filt_next);
9586 lck_mtx_unlock(&ifp->if_flt_lock);
9587
9588 dlil_detach_filter_internal(filter, 1);
9589 lck_mtx_lock(&ifp->if_flt_lock);
9590 }
9591 if_flt_monitor_leave(ifp);
9592 lck_mtx_unlock(&ifp->if_flt_lock);
9593
9594 /* Tell upper layers to drop their network addresses */
9595 if_purgeaddrs(ifp);
9596
9597 ifnet_lock_exclusive(ifp);
9598
9599 bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
9600 bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
9601
9602 /* Unplumb all protocols */
9603 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9604 struct if_proto *proto;
9605
9606 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9607 while (proto != NULL) {
9608 protocol_family_t family = proto->protocol_family;
9609 ifnet_lock_done(ifp);
9610 proto_unplumb(family, ifp);
9611 ifnet_lock_exclusive(ifp);
9612 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9613 }
9614 /* There should not be any protocols left */
9615 VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9616 }
9617 kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9618 ifp->if_proto_hash = NULL;
9619
9620 /* Detach (permanent) link address from if_addrhead */
9621 ifa = TAILQ_FIRST(&ifp->if_addrhead);
9622 VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9623 IFA_LOCK(ifa);
9624 if_detach_link_ifa(ifp, ifa);
9625 IFA_UNLOCK(ifa);
9626
9627 /* Remove (permanent) link address from ifnet_addrs[] */
9628 ifa_remref(ifa);
9629 ifnet_addrs[ifp->if_index - 1] = NULL;
9630
9631 /* This interface should not be on {ifnet_head,detaching} */
9632 VERIFY(ifp->if_link.tqe_next == NULL);
9633 VERIFY(ifp->if_link.tqe_prev == NULL);
9634 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9635 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9636 VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9637 VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9638
9639 /* The slot should have been emptied */
9640 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9641
9642 /* There should not be any addresses left */
9643 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9644
9645 /*
9646 * Signal the starter thread to terminate itself, and wait until
9647 * it has exited.
9648 */
9649 if (ifp->if_start_thread != THREAD_NULL) {
9650 lck_mtx_lock_spin(&ifp->if_start_lock);
9651 ifp->if_start_flags |= IFSF_TERMINATING;
9652 wakeup_one((caddr_t)&ifp->if_start_thread);
9653 lck_mtx_unlock(&ifp->if_start_lock);
9654
9655 /* wait for starter thread to terminate */
9656 lck_mtx_lock(&ifp->if_start_lock);
9657 while (ifp->if_start_thread != THREAD_NULL) {
9658 if (dlil_verbose) {
9659 DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
9660 __func__,
9661 if_name(ifp));
9662 }
9663 (void) msleep(&ifp->if_start_thread,
9664 &ifp->if_start_lock, (PZERO - 1),
9665 "ifnet_start_thread_exit", NULL);
9666 }
9667 lck_mtx_unlock(&ifp->if_start_lock);
9668 if (dlil_verbose) {
9669 DLIL_PRINTF("%s: %s starter thread termination complete",
9670 __func__, if_name(ifp));
9671 }
9672 }
9673
9674 /*
9675 * Signal the poller thread to terminate itself, and wait until
9676 * it has exited.
9677 */
9678 if (ifp->if_poll_thread != THREAD_NULL) {
9679 #if SKYWALK
9680 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9681 #endif /* SKYWALK */
9682 lck_mtx_lock_spin(&ifp->if_poll_lock);
9683 ifp->if_poll_flags |= IF_POLLF_TERMINATING;
9684 wakeup_one((caddr_t)&ifp->if_poll_thread);
9685 lck_mtx_unlock(&ifp->if_poll_lock);
9686
9687 /* wait for poller thread to terminate */
9688 lck_mtx_lock(&ifp->if_poll_lock);
9689 while (ifp->if_poll_thread != THREAD_NULL) {
9690 if (dlil_verbose) {
9691 DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
9692 __func__,
9693 if_name(ifp));
9694 }
9695 (void) msleep(&ifp->if_poll_thread,
9696 &ifp->if_poll_lock, (PZERO - 1),
9697 "ifnet_poll_thread_exit", NULL);
9698 }
9699 lck_mtx_unlock(&ifp->if_poll_lock);
9700 if (dlil_verbose) {
9701 DLIL_PRINTF("%s: %s poller thread termination complete\n",
9702 __func__, if_name(ifp));
9703 }
9704 }
9705
9706 /*
9707 * If thread affinity was set for the workloop thread, we will need
9708 * to tear down the affinity and release the extra reference count
9709 * taken at attach time. Does not apply to lo0 or other interfaces
9710 * without dedicated input threads.
9711 */
9712 if ((inp = ifp->if_inp) != NULL) {
9713 VERIFY(inp != dlil_main_input_thread);
9714
9715 if (inp->dlth_affinity) {
9716 struct thread *tp, *wtp, *ptp;
9717
9718 lck_mtx_lock_spin(&inp->dlth_lock);
9719 wtp = inp->dlth_driver_thread;
9720 inp->dlth_driver_thread = THREAD_NULL;
9721 ptp = inp->dlth_poller_thread;
9722 inp->dlth_poller_thread = THREAD_NULL;
9723 ASSERT(inp->dlth_thread != THREAD_NULL);
9724 tp = inp->dlth_thread; /* don't nullify now */
9725 inp->dlth_affinity_tag = 0;
9726 inp->dlth_affinity = FALSE;
9727 lck_mtx_unlock(&inp->dlth_lock);
9728
9729 /* Tear down poll thread affinity */
9730 if (ptp != NULL) {
9731 VERIFY(ifp->if_eflags & IFEF_RXPOLL);
9732 VERIFY(ifp->if_xflags & IFXF_LEGACY);
9733 (void) dlil_affinity_set(ptp,
9734 THREAD_AFFINITY_TAG_NULL);
9735 thread_deallocate(ptp);
9736 }
9737
9738 /* Tear down workloop thread affinity */
9739 if (wtp != NULL) {
9740 (void) dlil_affinity_set(wtp,
9741 THREAD_AFFINITY_TAG_NULL);
9742 thread_deallocate(wtp);
9743 }
9744
9745 /* Tear down DLIL input thread affinity */
9746 (void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
9747 thread_deallocate(tp);
9748 }
9749
9750 /* disassociate ifp DLIL input thread */
9751 ifp->if_inp = NULL;
9752
9753 /* if the worker thread was created, tell it to terminate */
9754 if (inp->dlth_thread != THREAD_NULL) {
9755 lck_mtx_lock_spin(&inp->dlth_lock);
9756 inp->dlth_flags |= DLIL_INPUT_TERMINATE;
9757 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
9758 wakeup_one((caddr_t)&inp->dlth_flags);
9759 }
9760 lck_mtx_unlock(&inp->dlth_lock);
9761 ifnet_lock_done(ifp);
9762
9763 /* wait for the input thread to terminate */
9764 lck_mtx_lock_spin(&inp->dlth_lock);
9765 while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
9766 == 0) {
9767 (void) msleep(&inp->dlth_flags, &inp->dlth_lock,
9768 (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
9769 }
9770 lck_mtx_unlock(&inp->dlth_lock);
9771 ifnet_lock_exclusive(ifp);
9772 }
9773
9774 /* clean-up input thread state */
9775 dlil_clean_threading_info(inp);
9776 /* clean-up poll parameters */
9777 VERIFY(ifp->if_poll_thread == THREAD_NULL);
9778 dlil_reset_rxpoll_params(ifp);
9779 }
9780
9781 /* The driver might unload, so point these to ourselves */
9782 if_free = ifp->if_free;
9783 ifp->if_output_dlil = ifp_if_output;
9784 ifp->if_output = ifp_if_output;
9785 ifp->if_pre_enqueue = ifp_if_output;
9786 ifp->if_start = ifp_if_start;
9787 ifp->if_output_ctl = ifp_if_ctl;
9788 ifp->if_input_dlil = ifp_if_input;
9789 ifp->if_input_poll = ifp_if_input_poll;
9790 ifp->if_input_ctl = ifp_if_ctl;
9791 ifp->if_ioctl = ifp_if_ioctl;
9792 ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
9793 ifp->if_free = ifp_if_free;
9794 ifp->if_demux = ifp_if_demux;
9795 ifp->if_event = ifp_if_event;
9796 ifp->if_framer_legacy = ifp_if_framer;
9797 ifp->if_framer = ifp_if_framer_extended;
9798 ifp->if_add_proto = ifp_if_add_proto;
9799 ifp->if_del_proto = ifp_if_del_proto;
9800 ifp->if_check_multi = ifp_if_check_multi;
9801
9802 /* wipe out interface description */
9803 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9804 ifp->if_desc.ifd_len = 0;
9805 VERIFY(ifp->if_desc.ifd_desc != NULL);
9806 bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
9807
9808 /* there shouldn't be any delegation by now */
9809 VERIFY(ifp->if_delegated.ifp == NULL);
9810 VERIFY(ifp->if_delegated.type == 0);
9811 VERIFY(ifp->if_delegated.family == 0);
9812 VERIFY(ifp->if_delegated.subfamily == 0);
9813 VERIFY(ifp->if_delegated.expensive == 0);
9814 VERIFY(ifp->if_delegated.constrained == 0);
9815 VERIFY(ifp->if_delegated.ultra_constrained == 0);
9816
9817 /* QoS marking get cleared */
9818 if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9819 if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
9820
9821 #if SKYWALK
9822 /* the nexus destructor is responsible for clearing these */
9823 VERIFY(ifp->if_na_ops == NULL);
9824 VERIFY(ifp->if_na == NULL);
9825 #endif /* SKYWALK */
9826
9827 /* promiscuous/allmulti counts need to start at zero again */
9828 ifp->if_pcount = 0;
9829 ifp->if_amcount = 0;
9830 ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
9831
9832 ifnet_lock_done(ifp);
9833
9834 #if PF
9835 /*
9836 * Detach this interface from packet filter, if enabled.
9837 */
9838 pf_ifnet_hook(ifp, 0);
9839 #endif /* PF */
9840
9841 /* Filter list should be empty */
9842 lck_mtx_lock_spin(&ifp->if_flt_lock);
9843 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9844 VERIFY(ifp->if_flt_busy == 0);
9845 VERIFY(ifp->if_flt_waiters == 0);
9846 VERIFY(ifp->if_flt_non_os_count == 0);
9847 VERIFY(ifp->if_flt_no_tso_count == 0);
9848 lck_mtx_unlock(&ifp->if_flt_lock);
9849
9850 /* Last chance to drain send queue */
9851 if_qflush_snd(ifp, 0);
9852
9853 /* Last chance to cleanup any cached route */
9854 lck_mtx_lock(&ifp->if_cached_route_lock);
9855 VERIFY(!ifp->if_fwd_cacheok);
9856 ROUTE_RELEASE(&ifp->if_fwd_route);
9857 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9858 ROUTE_RELEASE(&ifp->if_src_route);
9859 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9860 ROUTE_RELEASE(&ifp->if_src_route6);
9861 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9862 lck_mtx_unlock(&ifp->if_cached_route_lock);
9863
9864 /* Ignore any pending data threshold as the interface is anyways gone */
9865 ifp->if_data_threshold = 0;
9866
9867 VERIFY(ifp->if_dt_tcall != NULL);
9868 VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
9869
9870 ifnet_llreach_ifdetach(ifp);
9871
9872 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
9873
9874 /*
9875 * Finally, mark this ifnet as detached.
9876 */
9877 if (dlil_verbose) {
9878 DLIL_PRINTF("%s: detached\n", if_name(ifp));
9879 }
9880 lck_mtx_lock_spin(&ifp->if_ref_lock);
9881 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9882 panic("%s: flags mismatch (detaching not set) ifp=%p",
9883 __func__, ifp);
9884 /* NOTREACHED */
9885 }
9886 ifp->if_refflags &= ~IFRF_DETACHING;
9887 lck_mtx_unlock(&ifp->if_ref_lock);
9888 if (if_free != NULL) {
9889 if_free(ifp);
9890 }
9891
9892 ifclassq_release(&ifp->if_snd);
9893
9894 /* we're fully detached, clear the "in use" bit */
9895 dlifp = (struct dlil_ifnet *)ifp;
9896 lck_mtx_lock(&dlifp->dl_if_lock);
9897 ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
9898 dlifp->dl_if_flags &= ~DLIF_INUSE;
9899 lck_mtx_unlock(&dlifp->dl_if_lock);
9900
9901 /* Release reference held during ifnet attach */
9902 ifnet_release(ifp);
9903 }
9904
/*
 * Placeholder output handler installed on a detached ifnet (the detach
 * path above points if_output/if_output_dlil/if_pre_enqueue here).
 * Any packets that still arrive are silently dropped.
 */
errno_t
ifp_if_output(struct ifnet *ifp, struct mbuf *m)
{
#pragma unused(ifp)
	/* drop the whole chain; nothing can be transmitted any more */
	m_freem_list(m);
	return 0;
}
9912
/*
 * Placeholder if_start handler for a detached ifnet: discard anything
 * still queued for transmission.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9918
/*
 * Placeholder input handler for a detached ifnet: free the inbound
 * chain and report the interface as gone (ENXIO).
 */
static errno_t
ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
#pragma unused(ifp, m_tail, s, poll, tp)
	m_freem_list(m_head);
	return ENXIO;
}
9928
/*
 * Placeholder input-poll handler for a detached ifnet: hand back an
 * empty batch.  Each output parameter is optional, so NULL-check
 * before storing.
 */
static void
ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
    struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
{
#pragma unused(ifp, flags, max_cnt)
	if (m_head != NULL) {
		*m_head = NULL;
	}
	if (m_tail != NULL) {
		*m_tail = NULL;
	}
	if (cnt != NULL) {
		*cnt = 0;
	}
	if (len != NULL) {
		*len = 0;
	}
}
9947
/*
 * Placeholder input/output control handler for a detached ifnet:
 * no control operations are supported.
 */
static errno_t
ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
{
#pragma unused(ifp, cmd, arglen, arg)
	return EOPNOTSUPP;
}
9954
/*
 * Placeholder demux handler for a detached ifnet: free the packet and
 * return EJUSTRETURN so the caller does not process it further.
 */
static errno_t
ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
{
#pragma unused(ifp, fh, pf)
	m_freem(m);
	return EJUSTRETURN;
}
9962
/*
 * Placeholder add-protocol handler for a detached ifnet: no protocol
 * may be attached any more.
 */
static errno_t
ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
    const struct ifnet_demux_desc *da, u_int32_t dc)
{
#pragma unused(ifp, pf, da, dc)
	return EINVAL;
}
9970
/*
 * Placeholder delete-protocol handler for a detached ifnet.
 */
static errno_t
ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
{
#pragma unused(ifp, pf)
	return EINVAL;
}
9977
/*
 * Placeholder multicast-address check handler for a detached ifnet.
 */
static errno_t
ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
{
#pragma unused(ifp, sa)
	return EOPNOTSUPP;
}
9984
/*
 * Placeholder legacy framer for a detached ifnet.  The signature differs
 * by platform: the non-macOS variant carries pre/post pad-length output
 * parameters, the macOS variant does not.  Both simply delegate to
 * ifp_if_framer_extended() (passing NULL pads on macOS), which drops
 * the packet.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10003
/*
 * Placeholder extended framer for a detached ifnet: free the packet,
 * clear the caller's mbuf pointer, report zero-length framing pads,
 * and return EJUSTRETURN so no further output processing occurs.
 */
static errno_t
ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
{
#pragma unused(ifp, sa, ll, t)
	m_freem(*m);
	*m = NULL;

	/* pad-length outputs are optional */
	if (pre != NULL) {
		*pre = 0;
	}
	if (post != NULL) {
		*post = 0;
	}

	return EJUSTRETURN;
}
10022
/*
 * Placeholder ioctl handler for a detached ifnet.
 */
errno_t
ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
{
#pragma unused(ifp, cmd, arg)
	return EOPNOTSUPP;
}
10029
/*
 * Placeholder BPF tap handler for a detached ifnet; accepted as a no-op.
 */
static errno_t
ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
{
#pragma unused(ifp, tm, f)
	/* XXX not sure what to do here */
	return 0;
}
10037
/*
 * Placeholder if_free for a detached ifnet; intentionally a no-op
 * (the real driver's if_free was saved and invoked by the detach path).
 */
static void
ifp_if_free(struct ifnet *ifp)
{
#pragma unused(ifp)
}
10043
/*
 * Placeholder event handler for a detached ifnet; intentionally a no-op.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
#pragma unused(ifp, e)
}
10049
/*
 * Acquire a dlil_ifnet for a new interface of the given family.
 *
 * Under the global dlil list lock, scan every existing dlil_ifnet:
 *   - an in-use entry with the same extended name or the same unique id
 *     yields EBUSY;
 *   - an idle entry with a matching unique id is remembered for reuse
 *     (the whole list must still be walked to rule out name collisions).
 * If a reusable entry was found it is marked DLIF_INUSE|DLIF_REUSE and
 * returned; otherwise a fresh object is carved out of dlif_zone.
 *
 * On success *ifp points at the (referenced) ifnet; returns 0, EBUSY,
 * or ENOMEM.  The caller must have passed *ifp == NULL.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* name/xname point into storage embedded in the dlil_ifnet */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	LIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;
	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10228
/*
 * Common release path for a dlil_ifnet: drop the net_api allocation
 * counters, free the broadcast-address storage, restore if_name/if_xname
 * to the embedded storage (marking the xname with a trailing '?'), and
 * optionally clear DLIF_INUSE so dlil_if_acquire() can recycle the entry.
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	kfree_data_counted_by(ifp->if_broadcast.ptr, ifp->if_broadcast.length);
	lck_mtx_lock(&dlifp->dl_if_lock);
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10255
/*
 * Release a dlil_ifnet without clearing its in-use bit (the detach
 * path clears DLIF_INUSE separately once fully detached).
 */
__private_extern__ void
dlil_if_release(ifnet_t ifp)
{
	_dlil_if_release(ifp, false);
}
10261
/* Take the global lock protecting the dlil_ifnet list. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10267
/* Drop the global dlil_ifnet list lock. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10273
/* Assert that the current thread owns the dlil_ifnet list lock. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10279
/*
 * Detach the well-known protocol families (PF_INET, PF_INET6) from an
 * interface; see the comment below for why other families need no
 * explicit unplumb here.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10295
/*
 * Copy the interface's cached IPv4 source route into *dst under the
 * cached-route lock.  The lock is taken spinning and then converted to
 * a full mutex, since route_copyout() may not be safe at spin level.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10306
/*
 * Install *src as the interface's cached IPv4 source route, but only
 * while caching is enabled (if_fwd_cacheok); otherwise the route
 * reference in *src is released instead of being stored.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10320
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the cached IPv6
 * source route into *dst under the cached-route lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10332
/*
 * IPv6 counterpart of ifp_src_route_copyin(): install *src as the
 * cached IPv6 source route if caching is enabled, else release it.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10347
/*
 * Look up (and cache) the route for an IPv4 source address on this
 * interface.  If the cached route is unusable or is for a different
 * address, release it and perform a fresh scoped lookup; the new route
 * is stashed back in the interface cache.  Returns a referenced
 * rtentry (or NULL) that the caller is responsible for releasing.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route src_rt;
	struct sockaddr_in *dst;

	dst = SIN(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			/* (re)build the sockaddr from scratch */
			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped(SA(dst),
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10382
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): look up (and cache)
 * the route for an IPv6 source address on this interface, refreshing the
 * cache when the cached route is unusable or for a different address.
 * Returns a referenced rtentry (or NULL); the caller releases it.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			/* (re)build the sockaddr from scratch */
			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				SA(&src_rt.ro_dst), 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10419
/*
 * Update the interface's link quality metric (LQM) state.
 *
 * The raw lqm value is first normalized to one of the threshold edges
 * (ABORT / MINIMALLY_VIABLE / POOR / GOOD); hitting the ABORT edge also
 * kicks the TCP input PCB timer so aborted connections are handled
 * promptly.  If the normalized value equals the current valid state,
 * nothing is posted.  Otherwise the state is recorded and a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED event is posted with the ifnet
 * lock dropped; the lock is reacquired before return iff the caller
 * held it on entry (locked != 0).
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return; /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10483
/*
 * Update the interface's cellular RRC state and post
 * KEV_DL_RRC_STATE_CHANGED if the state actually changed.
 *
 * Called with the ifnet lock held exclusively (see if_state_update);
 * the lock is dropped while the kernel event is posted and reacquired
 * before returning to the caller.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10513
10514 errno_t
if_state_update(struct ifnet * ifp,struct if_interface_state * if_interface_state)10515 if_state_update(struct ifnet *ifp,
10516 struct if_interface_state *if_interface_state)
10517 {
10518 u_short if_index_available = 0;
10519
10520 ifnet_lock_exclusive(ifp);
10521
10522 if ((ifp->if_type != IFT_CELLULAR) &&
10523 (if_interface_state->valid_bitmask &
10524 IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10525 ifnet_lock_done(ifp);
10526 return ENOTSUP;
10527 }
10528 if ((if_interface_state->valid_bitmask &
10529 IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10530 (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10531 if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10532 ifnet_lock_done(ifp);
10533 return EINVAL;
10534 }
10535 if ((if_interface_state->valid_bitmask &
10536 IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10537 if_interface_state->rrc_state !=
10538 IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10539 if_interface_state->rrc_state !=
10540 IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10541 ifnet_lock_done(ifp);
10542 return EINVAL;
10543 }
10544
10545 if (if_interface_state->valid_bitmask &
10546 IF_INTERFACE_STATE_LQM_STATE_VALID) {
10547 if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10548 }
10549 if (if_interface_state->valid_bitmask &
10550 IF_INTERFACE_STATE_RRC_STATE_VALID) {
10551 if_rrc_state_update(ifp, if_interface_state->rrc_state);
10552 }
10553 if (if_interface_state->valid_bitmask &
10554 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10555 ifp->if_interface_state.valid_bitmask |=
10556 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10557 ifp->if_interface_state.interface_availability =
10558 if_interface_state->interface_availability;
10559
10560 if (ifp->if_interface_state.interface_availability ==
10561 IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10562 os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10563 __func__, if_name(ifp), ifp->if_index);
10564 if_index_available = ifp->if_index;
10565 } else {
10566 os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
10567 __func__, if_name(ifp), ifp->if_index);
10568 }
10569 }
10570 ifnet_lock_done(ifp);
10571
10572 /*
10573 * Check if the TCP connections going on this interface should be
10574 * forced to send probe packets instead of waiting for TCP timers
10575 * to fire. This is done on an explicit notification such as
10576 * SIOCSIFINTERFACESTATE which marks the interface as available.
10577 */
10578 if (if_index_available > 0) {
10579 tcp_interface_send_probe(if_index_available);
10580 }
10581
10582 return 0;
10583 }
10584
/*
 * Snapshot the interface's current state (RRC, LQM, availability) into
 * the caller's structure under the shared ifnet lock.  Only fields
 * whose valid bit is set on the interface are copied; the caller's
 * valid_bitmask reflects exactly which fields were filled in.
 */
void
if_get_state(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	ifnet_lock_shared(ifp);

	if_interface_state->valid_bitmask = 0;

	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_RRC_STATE_VALID;
		if_interface_state->rrc_state =
		    ifp->if_interface_state.rrc_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_LQM_STATE_VALID;
		if_interface_state->lqm_state =
		    ifp->if_interface_state.lqm_state;
	}
	if (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		if_interface_state->valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		if_interface_state->interface_availability =
		    ifp->if_interface_state.interface_availability;
	}

	ifnet_lock_done(ifp);
}
10617
10618 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10619 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10620 {
10621 if (conn_probe > 1) {
10622 return EINVAL;
10623 }
10624 if (conn_probe == 0) {
10625 if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10626 } else {
10627 if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10628 }
10629
10630 #if NECP
10631 necp_update_all_clients();
10632 #endif /* NECP */
10633
10634 tcp_probe_connectivity(ifp, conn_probe);
10635 return 0;
10636 }
10637
10638 /* for uuid.c */
10639 static int
get_ether_index(int * ret_other_index)10640 get_ether_index(int * ret_other_index)
10641 {
10642 struct ifnet *ifp;
10643 int en0_index = 0;
10644 int other_en_index = 0;
10645 int any_ether_index = 0;
10646 short best_unit = 0;
10647
10648 *ret_other_index = 0;
10649 TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
10650 /*
10651 * find en0, or if not en0, the lowest unit en*, and if not
10652 * that, any ethernet
10653 */
10654 ifnet_lock_shared(ifp);
10655 if (strcmp(ifp->if_name, "en") == 0) {
10656 if (ifp->if_unit == 0) {
10657 /* found en0, we're done */
10658 en0_index = ifp->if_index;
10659 ifnet_lock_done(ifp);
10660 break;
10661 }
10662 if (other_en_index == 0 || ifp->if_unit < best_unit) {
10663 other_en_index = ifp->if_index;
10664 best_unit = ifp->if_unit;
10665 }
10666 } else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
10667 any_ether_index = ifp->if_index;
10668 }
10669 ifnet_lock_done(ifp);
10670 }
10671 if (en0_index == 0) {
10672 if (other_en_index != 0) {
10673 *ret_other_index = other_en_index;
10674 } else if (any_ether_index != 0) {
10675 *ret_other_index = any_ether_index;
10676 }
10677 }
10678 return en0_index;
10679 }
10680
/*
 * Fill in the 6-byte node identifier for UUID generation from an
 * ethernet MAC address.  The en0 index is cached in a function-local
 * static and revalidated against ifindex2ifnet under the ifnet head
 * lock.  Prefers the permanent (factory) MAC when the driver recorded
 * one, since it never changes.  Returns 0 on success, -1 when no
 * suitable ethernet interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	/* refresh the cached index if unset or stale */
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10722
10723 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10724 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10725 int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10726 {
10727 struct kev_dl_node_presence kev;
10728 struct sockaddr_dl *sdl;
10729 struct sockaddr_in6 *sin6;
10730 int ret = 0;
10731
10732 VERIFY(ifp);
10733 VERIFY(sa);
10734 VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10735
10736 bzero(&kev, sizeof(kev));
10737 sin6 = &kev.sin6_node_address;
10738 sdl = &kev.sdl_node_address;
10739 nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10740 kev.rssi = rssi;
10741 kev.link_quality_metric = lqm;
10742 kev.node_proximity_metric = npm;
10743 bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10744
10745 ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10746 if (ret == 0 || ret == EEXIST) {
10747 int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10748 &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10749 if (err != 0) {
10750 log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10751 "error %d\n", __func__, err);
10752 }
10753 }
10754
10755 if (ret == EEXIST) {
10756 ret = 0;
10757 }
10758 return ret;
10759 }
10760
/*
 * Report that a neighboring node has left the link.  The node may be
 * identified by an AF_INET6 sockaddr (its link-layer address is then
 * recovered from the ND6 neighbor cache) or by an AF_LINK sockaddr
 * (the IPv6 address is derived from it).  On successful removal from
 * ND6, posts a KEV_DL_NODE_ABSENCE kernel event.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		/* stamp the event with this interface's type and index */
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
10801
/*
 * Variant of dlil_node_present() where the caller supplies the IPv6
 * sockaddr and the link-layer sockaddr separately (no decomposition
 * needed).  Records the node in ND6 and posts KEV_DL_NODE_PRESENCE on
 * success or EEXIST; EEXIST is folded into success.  Returns 0 or an
 * ND6 error.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	/* stamp the event with this interface's type and index */
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
10845
10846 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)10847 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
10848 kauth_cred_t *credp)
10849 {
10850 const u_int8_t *bytes;
10851 size_t size;
10852
10853 bytes = CONST_LLADDR(sdl);
10854 size = sdl->sdl_alen;
10855
10856 #if CONFIG_MACF
10857 if (dlil_lladdr_ckreq) {
10858 switch (sdl->sdl_type) {
10859 case IFT_ETHER:
10860 case IFT_IEEE1394:
10861 break;
10862 default:
10863 credp = NULL;
10864 break;
10865 }
10866 ;
10867
10868 if (credp && mac_system_check_info(*credp, "net.link.addr")) {
10869 static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
10870 [0] = 2
10871 };
10872
10873 bytes = unspec;
10874 }
10875 }
10876 #else
10877 #pragma unused(credp)
10878 #endif
10879
10880 if (sizep != NULL) {
10881 *sizep = size;
10882 }
10883 return bytes;
10884 }
10885
10886 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])10887 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
10888 u_int8_t info[DLIL_MODARGLEN])
10889 {
10890 struct kev_dl_issues kev;
10891 struct timeval tv;
10892
10893 VERIFY(ifp != NULL);
10894 VERIFY(modid != NULL);
10895 _CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
10896 _CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
10897
10898 bzero(&kev, sizeof(kev));
10899
10900 microtime(&tv);
10901 kev.timestamp = tv.tv_sec;
10902 bcopy(modid, &kev.modid, DLIL_MODIDLEN);
10903 if (info != NULL) {
10904 bcopy(info, &kev.info, DLIL_MODARGLEN);
10905 }
10906
10907 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
10908 &kev.link_data, sizeof(kev), FALSE);
10909 }
10910
/*
 * Handle the SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC ioctls: set or
 * get the interface's opportunistic throttling state, and on success
 * report back the number of opportunistic TCP/UDP connections currently
 * using the interface via ifo_inuse.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* Map the requested flags onto a throttling level */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		/* Get: translate the current level back into flags */
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/*
	 * EALREADY is mapped to success -- presumably the requested
	 * state was already in effect; note ifo_inuse is not filled in
	 * on this path.  TODO confirm against ifnet_set_throttle.
	 */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
10969
10970 int
ifnet_get_throttle(struct ifnet * ifp,u_int32_t * level)10971 ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
10972 {
10973 struct ifclassq *ifq;
10974 int err = 0;
10975
10976 if (!(ifp->if_eflags & IFEF_TXSTART)) {
10977 return ENXIO;
10978 }
10979
10980 *level = IFNET_THROTTLE_OFF;
10981
10982 ifq = ifp->if_snd;
10983 IFCQ_LOCK(ifq);
10984 /* Throttling works only for IFCQ, not ALTQ instances */
10985 if (IFCQ_IS_ENABLED(ifq)) {
10986 cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
10987
10988 err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
10989 *level = req.level;
10990 }
10991 IFCQ_UNLOCK(ifq);
10992
10993 return err;
10994 }
10995
10996 int
ifnet_set_throttle(struct ifnet * ifp,u_int32_t level)10997 ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
10998 {
10999 struct ifclassq *ifq;
11000 int err = 0;
11001
11002 if (!(ifp->if_eflags & IFEF_TXSTART)) {
11003 return ENXIO;
11004 }
11005
11006 ifq = ifp->if_snd;
11007
11008 switch (level) {
11009 case IFNET_THROTTLE_OFF:
11010 case IFNET_THROTTLE_OPPORTUNISTIC:
11011 break;
11012 default:
11013 return EINVAL;
11014 }
11015
11016 IFCQ_LOCK(ifq);
11017 if (IFCQ_IS_ENABLED(ifq)) {
11018 cqrq_throttle_t req = { 1, level };
11019
11020 err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11021 }
11022 IFCQ_UNLOCK(ifq);
11023
11024 if (err == 0) {
11025 DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
11026 level);
11027 #if NECP
11028 necp_update_all_clients();
11029 #endif /* NECP */
11030 if (level == IFNET_THROTTLE_OFF) {
11031 ifnet_start(ifp);
11032 }
11033 }
11034
11035 return err;
11036 }
11037
/*
 * Handle the SIOCSIFLOG / SIOCGIFLOG ioctls: set or get the interface
 * logging level, facility flags, category and subcategory.
 */
errno_t
ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
#pragma unused(p)
	errno_t result = 0;
	uint32_t flags;
	int level, category, subcategory;

	VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);

	if (cmd == SIOCSIFLOG) {
		/* Setting requires the interface-control privilege */
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
			return result;
		}

		/* Validate level and flags before applying anything */
		level = ifr->ifr_log.ifl_level;
		if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
			result = EINVAL;
		}

		/* At least one facility bit within the mask is required */
		flags = ifr->ifr_log.ifl_flags;
		if ((flags &= IFNET_LOGF_MASK) == 0) {
			result = EINVAL;
		}

		category = ifr->ifr_log.ifl_category;
		subcategory = ifr->ifr_log.ifl_subcategory;

		if (result == 0) {
			result = ifnet_set_log(ifp, level, flags,
			    category, subcategory);
		}
	} else {
		result = ifnet_get_log(ifp, &level, &flags, &category,
		    &subcategory);
		if (result == 0) {
			ifr->ifr_log.ifl_level = level;
			ifr->ifr_log.ifl_flags = flags;
			ifr->ifr_log.ifl_category = category;
			ifr->ifr_log.ifl_subcategory = subcategory;
		}
	}

	return result;
}
11085
/*
 * Apply a logging level, facility flags, category and subcategory to an
 * interface.  Facilities other than DLIL's own are forwarded to the
 * driver through its output control callback when one is registered;
 * otherwise they are silently dropped.  The new state is recorded in
 * ifp->if_log on success.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		/* DLIL's own facility is handled here, not by the driver */
		l.flags &= ~IFNET_LOGF_DLIL;
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			level = 0;
		}
	}

	if (err == 0) {
		/* Record the new state; IFNET_LOG_DEFAULT resets all flags */
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=0x%x "
		    "arg=0x%x, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags, flags,
		    category, subcategory);
	}

	return err;
}
11143
/*
 * Read back the interface's current logging state.  Every out-parameter
 * is optional; only non-NULL pointers are filled in.  Always returns 0.
 */
int
ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
    int32_t *category, int32_t *subcategory)
{
	if (level != NULL) {
		*level = ifp->if_log.level;
	}
	if (flags != NULL) {
		*flags = ifp->if_log.flags;
	}
	if (category != NULL) {
		*category = ifp->if_log.category;
	}
	if (subcategory != NULL) {
		*subcategory = ifp->if_log.subcategory;
	}

	return 0;
}
11163
11164 int
ifnet_notify_address(struct ifnet * ifp,int af)11165 ifnet_notify_address(struct ifnet *ifp, int af)
11166 {
11167 struct ifnet_notify_address_params na;
11168
11169 #if PF
11170 (void) pf_ifaddr_hook(ifp);
11171 #endif /* PF */
11172
11173 if (ifp->if_output_ctl == NULL) {
11174 return EOPNOTSUPP;
11175 }
11176
11177 bzero(&na, sizeof(na));
11178 na.address_family = (sa_family_t)af;
11179
11180 return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11181 sizeof(na), &na);
11182 }
11183
11184 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11185 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11186 {
11187 if (ifp == NULL || flowid == NULL) {
11188 return EINVAL;
11189 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11190 !IF_FULLY_ATTACHED(ifp)) {
11191 return ENXIO;
11192 }
11193
11194 *flowid = ifp->if_flowhash;
11195
11196 return 0;
11197 }
11198
11199 errno_t
ifnet_disable_output(struct ifnet * ifp)11200 ifnet_disable_output(struct ifnet *ifp)
11201 {
11202 int err = 0;
11203
11204 if (ifp == NULL) {
11205 return EINVAL;
11206 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11207 !IF_FULLY_ATTACHED(ifp)) {
11208 return ENXIO;
11209 }
11210
11211 lck_mtx_lock(&ifp->if_start_lock);
11212 if (ifp->if_start_flags & IFSF_FLOW_RESUME_PENDING) {
11213 ifp->if_start_flags &= ~(IFSF_FLOW_RESUME_PENDING | IFSF_FLOW_CONTROLLED);
11214 } else if ((err = ifnet_fc_add(ifp)) == 0) {
11215 ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
11216 }
11217 lck_mtx_unlock(&ifp->if_start_lock);
11218
11219 return err;
11220 }
11221
11222 errno_t
ifnet_enable_output(struct ifnet * ifp)11223 ifnet_enable_output(struct ifnet *ifp)
11224 {
11225 if (ifp == NULL) {
11226 return EINVAL;
11227 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11228 !IF_FULLY_ATTACHED(ifp)) {
11229 return ENXIO;
11230 }
11231
11232 ifnet_start_common(ifp, TRUE, FALSE);
11233 return 0;
11234 }
11235
/*
 * Flow-advisory callback: a driver signals that the flow identified by
 * @flowhash may resume.  Looks up (and removes) the matching flow
 * control entry, re-enables output on its interface if the hash still
 * matches, and frees the entry.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			/*
			 * If output was not flow-controlled, record that
			 * a resume arrived so a racing
			 * ifnet_disable_output() can cancel against it.
			 */
			if ((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) == 0) {
				ifp->if_start_flags |= IFSF_FLOW_RESUME_PENDING;
			}
			lck_mtx_unlock(&ifp->if_start_lock);
			(void) ifnet_enable_output(ifp);
		}
		/* Drop the I/O reference taken by ifnet_is_attached() */
		ifnet_decr_iorefcnt(ifp);
	}
	ifnet_fc_entry_free(ifce);
}
11264
11265 /*
11266 * Function to compare ifnet_fc_entries in ifnet flow control tree
11267 */
11268 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11269 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11270 {
11271 return fc1->ifce_flowhash - fc2->ifce_flowhash;
11272 }
11273
/*
 * Register the interface's flow hash in the ifnet flow control tree so
 * that a later flow advisory (ifnet_flowadv) can find it.  Returns 0 if
 * inserted (or already present for this ifp), EAGAIN if a different
 * interface already occupies the same hash slot.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	/* Search key carries only the flow hash */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer. There can be a collision
		 * on flow hash but the probability is low. Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex (the Z_WAITOK allocation below may block) */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11317
/*
 * Look up the flow control entry for @flowhash and remove it from the
 * tree.  Returns the entry (ownership transfers to the caller, who must
 * free it with ifnet_fc_entry_free), or NULL if no entry exists or its
 * interface is no longer attached (in which case the entry is freed
 * here).
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	/* Search key carries only the flow hash */
	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11355
/*
 * Release a flow control entry back to its zone.  Callers remove the
 * entry from ifnet_fc_tree first (see ifnet_fc_get).
 */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11361
11362 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11363 ifnet_calc_flowhash(struct ifnet *ifp)
11364 {
11365 struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11366 uint32_t flowhash = 0;
11367
11368 if (ifnet_flowhash_seed == 0) {
11369 ifnet_flowhash_seed = RandomULong();
11370 }
11371
11372 bzero(&fh, sizeof(fh));
11373
11374 (void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11375 fh.ifk_unit = ifp->if_unit;
11376 fh.ifk_flags = ifp->if_flags;
11377 fh.ifk_eflags = ifp->if_eflags;
11378 fh.ifk_capabilities = ifp->if_capabilities;
11379 fh.ifk_capenable = ifp->if_capenable;
11380 fh.ifk_output_sched_model = ifp->if_output_sched_model;
11381 fh.ifk_rand1 = RandomULong();
11382 fh.ifk_rand2 = RandomULong();
11383
11384 try_again:
11385 flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11386 if (flowhash == 0) {
11387 /* try to get a non-zero flowhash */
11388 ifnet_flowhash_seed = RandomULong();
11389 goto try_again;
11390 }
11391
11392 return flowhash;
11393 }
11394
/*
 * Store (or clear, when @len is 0) the network signature for the given
 * address family on the interface.  Returns EINVAL for an unsupported
 * family or oversized signature, ENOMEM when the per-family extra data
 * has not been allocated.  @flags is currently unused.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				/* Signature too large for the fixed buffer */
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				/* Signature too large for the fixed buffer */
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11456
/*
 * Copy out the stored network signature for the given address family.
 * On input *len is the caller's buffer capacity; on success it is
 * updated to the actual signature length copied into @data.  Returns
 * EINVAL for a too-small buffer or unsupported family, ENOENT when no
 * signature is stored, ENOMEM when the per-family extra data has not
 * been allocated.  @flags, if supplied, is zeroed on success.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			/* Caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			/* Caller's buffer must hold the whole signature */
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11517
/*
 * Install (or clear, per-slot when prefix_len is 0) the interface's
 * NAT64 prefixes.  Each of the NAT64_MAX_NUM_PREFIXES entries must have
 * a standard RFC 6052 prefix length and must not be of embedded
 * (interface/link-local) scope.  When at least one prefix was set,
 * NECP clients are notified of the change.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			/* Only the standard NAT64 prefix lengths are valid */
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	if (error == 0 && one_set != 0) {
		necp_update_all_clients();
	}

	return error;
}
11583
11584 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)11585 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11586 {
11587 int i, found_one = 0, error = 0;
11588
11589 if (ifp == NULL) {
11590 return EINVAL;
11591 }
11592
11593 if_inet6data_lock_shared(ifp);
11594
11595 if (IN6_IFEXTRA(ifp) == NULL) {
11596 error = ENOMEM;
11597 goto out;
11598 }
11599
11600 for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11601 if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
11602 found_one = 1;
11603 }
11604 }
11605
11606 if (found_one == 0) {
11607 error = ENOENT;
11608 goto out;
11609 }
11610
11611 if (prefixes) {
11612 bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
11613 sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
11614 }
11615
11616 out:
11617 if_inet6data_lock_done(ifp);
11618
11619 return error;
11620 }
11621
/*
 * Checksum-debug hook on the output path: when HWCKSUM_DBG_FINALIZE_FORCED
 * is enabled, finalize IPv4/IPv6 checksums in software before the packet
 * reaches the driver, and count what was finalized.  TSO packets are
 * left alone (their checksums are computed by the hardware per-segment).
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		return;
	}
}
11663
/*
 * Checksum-debug hook on the input path.  Depending on hwcksum_dbg_mode
 * it can (a) force partial checksum offload by computing the 16-bit 1's
 * complement sum in software from a configured offset, and/or (b) verify
 * a driver-supplied partial checksum against a software recomputation,
 * and/or (c) re-base the partial sum to a different start offset to
 * emulate various hardware.  All offsets from the hardware are relative
 * to the frame header; internal computation is relative to the start of
 * the protocol payload (hence the +/- hlen adjustments).
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* Sanity: frame header must lie within this mbuf's data storage */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	/* Length of the frame header preceding the protocol payload */
	hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		/* Only IPv4/IPv6 traffic is subject to these checks */
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		/* Discard whatever checksum state the driver provided */
		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		/* Make rxoff relative to the protocol payload */
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* Re-base the sum to the adjusted start offset */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11788
#if DEBUG || DEVELOPMENT
/* Blob for sum16 verification (arbitrary gzip-compressed bytes) */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};

/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t init;   /* sumr filled in lazily on first run */
	uint16_t len;     /* span length, from offset 0 of sumdata */
	uint16_t sumr;    /* reference */
	uint16_t sumrp;   /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
#define SUMTBL_MAX ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
11851
/*
 * Boot-time self-test for the 16-bit 1's complement sum routines
 * (m_sum16, b_sum16) against the reference in_cksum_mbuf_ref(), over
 * every span in sumtbl and every byte alignment within a word.  Panics
 * on any mismatch; prints PASSED otherwise.
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *);      /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (uintptr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			/* Lazily compute the reference sum on first pass */
			if (!sumtbl[n].init) {
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (uintptr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by aligment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
#endif /* DEBUG || DEVELOPMENT */
11941
11942 #define CASE_STRINGIFY(x) case x: return #x
11943
11944 __private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)11945 dlil_kev_dl_code_str(u_int32_t event_code)
11946 {
11947 switch (event_code) {
11948 CASE_STRINGIFY(KEV_DL_SIFFLAGS);
11949 CASE_STRINGIFY(KEV_DL_SIFMETRICS);
11950 CASE_STRINGIFY(KEV_DL_SIFMTU);
11951 CASE_STRINGIFY(KEV_DL_SIFPHYS);
11952 CASE_STRINGIFY(KEV_DL_SIFMEDIA);
11953 CASE_STRINGIFY(KEV_DL_SIFGENERIC);
11954 CASE_STRINGIFY(KEV_DL_ADDMULTI);
11955 CASE_STRINGIFY(KEV_DL_DELMULTI);
11956 CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
11957 CASE_STRINGIFY(KEV_DL_IF_DETACHING);
11958 CASE_STRINGIFY(KEV_DL_IF_DETACHED);
11959 CASE_STRINGIFY(KEV_DL_LINK_OFF);
11960 CASE_STRINGIFY(KEV_DL_LINK_ON);
11961 CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
11962 CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
11963 CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
11964 CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
11965 CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
11966 CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
11967 CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
11968 CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
11969 CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
11970 CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
11971 CASE_STRINGIFY(KEV_DL_ISSUES);
11972 CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
11973 default:
11974 break;
11975 }
11976 return "";
11977 }
11978
11979 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)11980 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
11981 {
11982 #pragma unused(arg1)
11983 struct ifnet *ifp = arg0;
11984
11985 if (ifnet_is_attached(ifp, 1)) {
11986 nstat_ifnet_threshold_reached(ifp->if_index);
11987 ifnet_decr_iorefcnt(ifp);
11988 }
11989 }
11990
11991 void
ifnet_notify_data_threshold(struct ifnet * ifp)11992 ifnet_notify_data_threshold(struct ifnet *ifp)
11993 {
11994 uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
11995 uint64_t oldbytes = ifp->if_dt_bytes;
11996
11997 ASSERT(ifp->if_dt_tcall != NULL);
11998
11999 /*
12000 * If we went over the threshold, notify NetworkStatistics.
12001 * We rate-limit it based on the threshold interval value.
12002 */
12003 if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
12004 OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
12005 !thread_call_isactive(ifp->if_dt_tcall)) {
12006 uint64_t tival = (threshold_interval * NSEC_PER_SEC);
12007 uint64_t now = mach_absolute_time(), deadline = now;
12008 uint64_t ival;
12009
12010 if (tival != 0) {
12011 nanoseconds_to_absolutetime(tival, &ival);
12012 clock_deadline_for_periodic_event(ival, now, &deadline);
12013 (void) thread_call_enter_delayed(ifp->if_dt_tcall,
12014 deadline);
12015 } else {
12016 (void) thread_call_enter(ifp->if_dt_tcall);
12017 }
12018 }
12019 }
12020
12021
/*
 * Forward per-flow interface statistics to the TCP layer's
 * per-flow stats accounting.
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12028
/*
 * Atomically OR set_flags into *flags_p; returns the flags word as it
 * was before the update (OSBitOrAtomic semantics).
 */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12034
/* Atomically clear clear_flags in *flags_p (AND with the complement). */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12040
/*
 * Atomically set bits in the interface's extended flags (if_eflags);
 * returns the previous value of if_eflags.
 */
__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}
12046
/* Atomically clear bits in the interface's extended flags (if_eflags). */
__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}
12052
/*
 * Atomically set bits in the interface's xflags (if_xflags);
 * returns the previous value of if_xflags.
 */
__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}
12058
/* Atomically clear bits in the interface's xflags (if_xflags). */
__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}
12064
/*
 * Bump the interface's traffic-rule generation counter so that
 * observers (see ifnet_sync_traffic_rule_genid) notice a change.
 */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}
12070
12071 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12072 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12073 {
12074 if (*genid != ifp->if_traffic_rule_genid) {
12075 *genid = ifp->if_traffic_rule_genid;
12076 return TRUE;
12077 }
12078 return FALSE;
12079 }
/*
 * Publish the interface's traffic-rule count (release store so readers
 * see the rules before the count), then bump the generation id so
 * observers pick up the change.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
12086
12087 static void
log_hexdump(void * data,size_t len)12088 log_hexdump(void *data, size_t len)
12089 {
12090 size_t i, j, k;
12091 unsigned char *ptr = (unsigned char *)data;
12092 #define MAX_DUMP_BUF 32
12093 unsigned char buf[3 * MAX_DUMP_BUF + 1];
12094
12095 for (i = 0; i < len; i += MAX_DUMP_BUF) {
12096 for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12097 unsigned char msnbl = ptr[j] >> 4;
12098 unsigned char lsnbl = ptr[j] & 0x0f;
12099
12100 buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12101 buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12102
12103 if ((j % 2) == 1) {
12104 buf[k++] = ' ';
12105 }
12106 if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12107 buf[k++] = ' ';
12108 }
12109 }
12110 buf[k] = 0;
12111 os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12112 }
12113 }
12114
12115 #if SKYWALK
12116 static bool
net_check_compatible_if_filter(struct ifnet * ifp)12117 net_check_compatible_if_filter(struct ifnet *ifp)
12118 {
12119 if (ifp == NULL) {
12120 if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
12121 return false;
12122 }
12123 } else {
12124 if (ifp->if_flt_non_os_count > 0) {
12125 return false;
12126 }
12127 }
12128 return true;
12129 }
12130 #endif /* SKYWALK */
12131
/*
 * Account for the 'k' bytes just written into the dump buffer: shrink
 * the remaining space 'clen', advance the cursor 'c', and bail out to
 * the caller's 'done' label when the buffer is exhausted.  Relies on
 * locals c, k, clen and a done: label in the calling function.
 * do/while(0) makes the macro a single statement, safe in unbraced
 * if/else bodies.
 */
#define DUMP_BUF_CHK() do { \
	clen -= k; \
	if (clen < 1) \
		goto done; \
	c += k; \
} while (0)
12138
int dlil_dump_top_if_qlen(char *, int);
/*
 * Debug helper: scan all interfaces and print (into str, at most str_len
 * bytes) the interface with the longest send queue (ifcq_len) and the
 * one with the longest DLIL input queue.  Returns the number of bytes
 * written to str.
 *
 * NOTE(review): iterates ifindex2ifnet without visibly taking any lock
 * here — presumably acceptable for a best-effort debug dump; confirm
 * against the callers' context.
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;              /* write cursor into str */
	int k, clen = str_len;      /* k: last write size; clen: space left */
	struct ifnet *top_ifcq_ifp = NULL;  /* ifp with longest send queue */
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;   /* ifp with longest input queue */
	uint32_t top_inq_len = 0;

	/*
	 * NOTE(review): valid indices appear to run 1..if_index; the '<'
	 * bound skips the slot at if_index itself — confirm intentional.
	 */
	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		/* dlil_ifnet embeds the ifnet, so this cast recovers it */
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12180