1 /*
2 * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
30 * support for mandatory and extensible security protections. This notice
31 * is included in support of clause 2.2 (b) of the Apple Public License,
32 * Version 2.0.
33 */
34 #include "kpi_interface.h"
35 #include <stddef.h>
36 #include <ptrauth.h>
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/domain.h>
45 #include <sys/user.h>
46 #include <sys/random.h>
47 #include <sys/socketvar.h>
48 #include <net/if_dl.h>
49 #include <net/if.h>
50 #include <net/route.h>
51 #include <net/if_var.h>
52 #include <net/dlil.h>
53 #include <net/dlil_sysctl.h>
54 #include <net/dlil_var_private.h>
55 #include <net/if_arp.h>
56 #include <net/iptap.h>
57 #include <net/pktap.h>
58 #include <net/droptap.h>
59 #include <net/nwk_wq.h>
60 #include <sys/kern_event.h>
61 #include <sys/kdebug.h>
62 #include <sys/mcache.h>
63 #include <sys/syslog.h>
64 #include <sys/protosw.h>
65 #include <sys/priv.h>
66
67 #include <kern/assert.h>
68 #include <kern/task.h>
69 #include <kern/thread.h>
70 #include <kern/sched_prim.h>
71 #include <kern/locks.h>
72 #include <kern/zalloc.h>
73
74 #include <net/kpi_protocol.h>
75 #include <net/if_types.h>
76 #include <net/if_ipsec.h>
77 #include <net/if_llreach.h>
78 #include <net/if_utun.h>
79 #include <net/kpi_interfacefilter.h>
80 #include <net/classq/classq.h>
81 #include <net/classq/classq_sfb.h>
82 #include <net/flowhash.h>
83 #include <net/ntstat.h>
84 #if SKYWALK
85 #include <skywalk/lib/net_filter_event.h>
86 #endif /* SKYWALK */
87 #include <net/net_api_stats.h>
88 #include <net/if_ports_used.h>
89 #include <net/if_vlan_var.h>
90 #include <netinet/in.h>
91 #if INET
92 #include <netinet/in_var.h>
93 #include <netinet/igmp_var.h>
94 #include <netinet/ip_var.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_var.h>
97 #include <netinet/udp.h>
98 #include <netinet/udp_var.h>
99 #include <netinet/if_ether.h>
100 #include <netinet/in_pcb.h>
101 #include <netinet/in_tclass.h>
102 #include <netinet/ip.h>
103 #include <netinet/ip_icmp.h>
104 #include <netinet/icmp_var.h>
105 #endif /* INET */
106
107 #include <net/nat464_utils.h>
108 #include <netinet6/in6_var.h>
109 #include <netinet6/nd6.h>
110 #include <netinet6/mld6_var.h>
111 #include <netinet6/scope6_var.h>
112 #include <netinet/ip6.h>
113 #include <netinet/icmp6.h>
114 #include <net/pf_pbuf.h>
115 #include <libkern/OSAtomic.h>
116 #include <libkern/tree.h>
117
118 #include <dev/random/randomdev.h>
119 #include <machine/machine_routines.h>
120
121 #include <mach/thread_act.h>
122 #include <mach/sdt.h>
123
124 #if CONFIG_MACF
125 #include <sys/kauth.h>
126 #include <security/mac_framework.h>
127 #include <net/ethernet.h>
128 #include <net/firewire.h>
129 #endif
130
131 #if PF
132 #include <net/pfvar.h>
133 #endif /* PF */
134 #include <net/pktsched/pktsched.h>
135 #include <net/pktsched/pktsched_netem.h>
136
137 #if NECP
138 #include <net/necp.h>
139 #endif /* NECP */
140
141 #if SKYWALK
142 #include <skywalk/packet/packet_queue.h>
143 #include <skywalk/nexus/netif/nx_netif.h>
144 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
145 #endif /* SKYWALK */
146
147 #include <net/sockaddr_utils.h>
148
149 #include <os/log.h>
150
151 #define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0)
152 #define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2)
153 #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
154 #define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
155 #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))
156
157 #define IF_DATA_REQUIRE_ALIGNED_64(f) \
158 _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))
159
160 #define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \
161 _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
162
163 enum {
164 kProtoKPI_v1 = 1,
165 kProtoKPI_v2 = 2
166 };
167
168 uint64_t if_creation_generation_count = 0;
169
170 /*
171 * List of if_proto structures in if_proto_hash[] is protected by
172 * the ifnet lock. The rest of the fields are initialized at protocol
173 * attach time and never change, thus no lock required as long as
174 * a reference to it is valid, via if_proto_ref().
175 */
struct if_proto {
	SLIST_ENTRY(if_proto) next_hash;        /* if_proto_hash[] chain linkage (ifnet lock) */
	u_int32_t refcount;                     /* reference count; see if_proto_ref() */
	u_int32_t detached;                     /* non-zero once detach has begun */
	struct ifnet *ifp;                      /* interface this protocol is attached to */
	protocol_family_t protocol_family;      /* attached protocol family */
	int proto_kpi;                          /* kProtoKPI_v1 or kProtoKPI_v2 */
	/* callbacks; which arm is valid is selected by proto_kpi */
	union {
		struct {
			proto_media_input input;        /* per-packet input */
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v1;
		struct {
			proto_media_input_v2 input;     /* packet-chain input */
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v2;
	} kpi;
};
204
205 SLIST_HEAD(proto_hash_entry, if_proto);
206
207 #define DLIL_SDLDATALEN \
208 (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))
209
210 /*
211 * In the common case, the LL address is stored in the
212 * `dl_if_lladdr' member of the `dlil_ifnet'. This is sufficient
213 * for LL addresses that do not exceed the `DLIL_SDLMAXLEN' constant.
214 */
struct dl_if_lladdr_std {
	struct ifaddr ifa;                       /* embedded link-level ifaddr */
	u_int8_t addr_sdl_bytes[DLIL_SDLMAXLEN]; /* sockaddr_dl storage: address */
	u_int8_t mask_sdl_bytes[DLIL_SDLMAXLEN]; /* sockaddr_dl storage: netmask */
};
220
221 /*
222 * However, in some rare cases we encounter LL addresses which
223 * would not fit in the `DLIL_SDLMAXLEN' limitation. In such cases
224 * we allocate the storage in the permanent arena, using this memory layout.
225 */
struct dl_if_lladdr_xtra_space {
	struct ifaddr ifa;                        /* embedded link-level ifaddr */
	u_int8_t addr_sdl_bytes[SOCK_MAXADDRLEN]; /* sockaddr_dl storage: address */
	u_int8_t mask_sdl_bytes[SOCK_MAXADDRLEN]; /* sockaddr_dl storage: netmask */
};
231
/*
 * DLIL's private wrapper around the public ifnet; one per interface.
 * Instances live on dlil_ifnet_head and may be recycled (DLIF_REUSE).
 */
struct dlil_ifnet {
	struct ifnet dl_if; /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link;     /* dlil_ifnet link */
	u_int32_t dl_if_flags;                  /* flags (below) */
	u_int32_t dl_if_refcnt;                 /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void *dl_if_uniqueid;                   /* unique interface id */
	size_t dl_if_uniqueid_len;              /* length of the unique id */
	char dl_if_namestorage[IFNAMSIZ];       /* interface name storage */
	char dl_if_xnamestorage[IFXNAMSIZ];     /* external name storage */
	struct dl_if_lladdr_std dl_if_lladdr;   /* link-level address storage*/
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;  /* non-zero once above is valid */
	u_int8_t dl_if_unused;                  /* padding */
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t dl_if_attach;                  /* attach PC stacktrace */
	ctrace_t dl_if_detach;                  /* detach PC stacktrace */
};
255
256 /* Values for dl_if_flags (private to DLIL) */
257 #define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */
258 #define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */
259 #define DLIF_DEBUG 0x4 /* has debugging info */
260
261 #define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */
262
263 /* For gdb */
264 __private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;
265
/* Debug variant of dlil_ifnet, used when debugging info is enabled */
struct dlil_ifnet_dbg {
	struct dlil_ifnet dldbg_dlif;           /* dlil_ifnet */
	u_int16_t dldbg_if_refhold_cnt;         /* # ifnet references */
	u_int16_t dldbg_if_refrele_cnt;         /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};
276
277 #define DLIL_TO_IFP(s) (&s->dl_if)
278 #define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s)
279
/* Interface filter attached to an ifnet; see interface filter KPIs */
struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter) filt_next;    /* filter list linkage */
	u_int32_t filt_skip;                    /* non-zero: filter is skipped */
	u_int32_t filt_flags;                   /* filter flags */
	ifnet_t filt_ifp;                       /* interface being filtered */
	const char *filt_name;                  /* name supplied at attach time */
	void *filt_cookie;                      /* opaque client context */
	protocol_family_t filt_protocol;        /* protocol of interest */
	iff_input_func filt_input;              /* inbound packet hook */
	iff_output_func filt_output;            /* outbound packet hook */
	iff_event_func filt_event;              /* interface event hook */
	iff_ioctl_func filt_ioctl;              /* ioctl hook */
	iff_detached_func filt_detached;        /* detach completion callback */
};
294
295 /* Mbuf queue used for freeing the excessive mbufs */
296 typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;
297
298 struct proto_input_entry;
299
300 static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;
301
302 static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);
303
304 static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
305 LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
306 static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
307 static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
308 static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");
309
310 LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
311 static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
312 &dlil_lck_attributes);
313 static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
314 &dlil_lck_attributes);
315
316 #if DEBUG
317 static unsigned int ifnet_debug = 1; /* debugging (enabled) */
318 #else
319 static unsigned int ifnet_debug; /* debugging (disabled) */
320 #endif /* !DEBUG */
321 static unsigned int dlif_size; /* size of dlil_ifnet to allocate */
322 static unsigned int dlif_bufsize; /* size of dlif_size + headroom */
323 static struct zone *dlif_zone; /* zone for dlil_ifnet */
324 #define DLIF_ZONE_NAME "ifnet" /* zone name */
325
326 static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);
327
328 static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);
329
330 static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */
331 static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
332 static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */
333 #define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */
334
335 static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */
336 static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */
337 static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */
338 #define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */
339
340 static u_int32_t net_rtref;
341
342 static struct dlil_main_threading_info dlil_main_input_thread_info;
343 __private_extern__ struct dlil_threading_info *dlil_main_input_thread =
344 (struct dlil_threading_info *)&dlil_main_input_thread_info;
345
346 static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
347 static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
348 static void dlil_if_trace(struct dlil_ifnet *, int);
349 static void if_proto_ref(struct if_proto *);
350 static void if_proto_free(struct if_proto *);
351 static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
352 static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
353 u_int32_t list_count);
354 static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
355 static void if_flt_monitor_busy(struct ifnet *);
356 static void if_flt_monitor_unbusy(struct ifnet *);
357 static void if_flt_monitor_enter(struct ifnet *);
358 static void if_flt_monitor_leave(struct ifnet *);
359 static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
360 char **, protocol_family_t, boolean_t);
361 static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
362 protocol_family_t);
363 static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
364 const struct sockaddr_dl *);
365 static int ifnet_lookup(struct ifnet *);
366 static void if_purgeaddrs(struct ifnet *);
367
368 static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
369 struct mbuf *, char *);
370 static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
371 struct mbuf *);
372 static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
373 mbuf_t *, const struct sockaddr *, void *, char *, char *);
374 static void ifproto_media_event(struct ifnet *, protocol_family_t,
375 const struct kev_msg *);
376 static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
377 unsigned long, void *);
378 static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
379 struct sockaddr_dl *, size_t);
380 static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
381 const struct sockaddr_dl *, const struct sockaddr *,
382 const struct sockaddr_dl *, const struct sockaddr *);
383
384 static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
385 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
386 boolean_t poll, struct thread *tp);
387 static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
388 struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
389 static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
390 static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
391 protocol_family_t *);
392 static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
393 const struct ifnet_demux_desc *, u_int32_t);
394 static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
395 static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
396 #if !XNU_TARGET_OS_OSX
397 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
398 const struct sockaddr *, const char *, const char *,
399 u_int32_t *, u_int32_t *);
400 #else /* XNU_TARGET_OS_OSX */
401 static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
402 const struct sockaddr *, const char *, const char *);
403 #endif /* XNU_TARGET_OS_OSX */
404 static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
405 const struct sockaddr *, const char *, const char *,
406 u_int32_t *, u_int32_t *);
407 static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
408 static void ifp_if_free(struct ifnet *);
409 static void ifp_if_event(struct ifnet *, const struct kev_msg *);
410 static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
411 static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);
412
413 static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
414 dlil_freeq_t *, struct ifnet_stat_increment_param *);
415
416 static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
417 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
418 boolean_t, struct thread *);
419 static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
420 struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
421 boolean_t, struct thread *);
422
423 static void dlil_main_input_thread_func(void *, wait_result_t);
424 static void dlil_main_input_thread_cont(void *, wait_result_t);
425
426 static void dlil_input_thread_func(void *, wait_result_t);
427 static void dlil_input_thread_cont(void *, wait_result_t);
428
429 static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
430 static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);
431
432 static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
433 thread_continue_t *);
434 static void dlil_terminate_input_thread(struct dlil_threading_info *);
435 static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
436 struct dlil_threading_info *, struct ifnet *, boolean_t);
437 static boolean_t dlil_input_stats_sync(struct ifnet *,
438 struct dlil_threading_info *);
439 static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
440 u_int32_t, ifnet_model_t, boolean_t);
441 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
442 const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
443 static int dlil_is_clat_needed(protocol_family_t, mbuf_t );
444 static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
445 static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
446 #if DEBUG || DEVELOPMENT
447 static void dlil_verify_sum16(void);
448 #endif /* DEBUG || DEVELOPMENT */
449 static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
450 protocol_family_t);
451 static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
452 protocol_family_t);
453
454 static void dlil_incr_pending_thread_count(void);
455 static void dlil_decr_pending_thread_count(void);
456
457 static void ifnet_detacher_thread_func(void *, wait_result_t);
458 static void ifnet_detacher_thread_cont(void *, wait_result_t);
459 static void ifnet_detach_final(struct ifnet *);
460 static void ifnet_detaching_enqueue(struct ifnet *);
461 static struct ifnet *ifnet_detaching_dequeue(void);
462
463 static void ifnet_start_thread_func(void *, wait_result_t);
464 static void ifnet_start_thread_cont(void *, wait_result_t);
465
466 static void ifnet_poll_thread_func(void *, wait_result_t);
467 static void ifnet_poll_thread_cont(void *, wait_result_t);
468
469 static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
470 classq_pkt_t *, boolean_t, boolean_t *);
471
472 static void ifp_src_route_copyout(struct ifnet *, struct route *);
473 static void ifp_src_route_copyin(struct ifnet *, struct route *);
474 static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
475 static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);
476
477 static errno_t if_mcasts_update_async(struct ifnet *);
478
479 /* The following are protected by dlil_ifnet_lock */
480 static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
481 static u_int32_t ifnet_detaching_cnt;
482 static boolean_t ifnet_detaching_embryonic;
483 static void *ifnet_delayed_run; /* wait channel for detaching thread */
484
485 static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
486 &dlil_lck_attributes);
487
488 static uint32_t ifnet_flowhash_seed;
489
/*
 * Interface attributes folded together (with random salts) to produce
 * the per-interface flow hash; see ifnet_calc_flowhash().
 */
struct ifnet_flowhash_key {
	char ifk_name[IFNAMSIZ];        /* interface name */
	uint32_t ifk_unit;              /* unit number */
	uint32_t ifk_flags;             /* interface flags */
	uint32_t ifk_eflags;            /* extended flags */
	uint32_t ifk_capabilities;      /* supported capabilities */
	uint32_t ifk_capenable;         /* enabled capabilities */
	uint32_t ifk_output_sched_model; /* output scheduling model */
	uint32_t ifk_rand1;             /* random salt */
	uint32_t ifk_rand2;             /* random salt */
};
501
502 /* Flow control entry per interface */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;    /* ifnet_fc_tree linkage */
	u_int32_t ifce_flowhash;                /* lookup key: interface flow hash */
	struct ifnet *ifce_ifp;                 /* interface back-pointer */
};
508
509 static uint32_t ifnet_calc_flowhash(struct ifnet *);
510 static int ifce_cmp(const struct ifnet_fc_entry *,
511 const struct ifnet_fc_entry *);
512 static int ifnet_fc_add(struct ifnet *);
513 static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
514 static void ifnet_fc_entry_free(struct ifnet_fc_entry *);
515
516 /* protected by ifnet_fc_lock */
517 RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
518 RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
519 RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
520
521 static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);
522
523 extern void bpfdetach(struct ifnet *);
524 extern void proto_input_run(void);
525
526 extern uint32_t udp_count_opportunistic(unsigned int ifindex,
527 u_int32_t flags);
528 extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
529 u_int32_t flags);
530
531 __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);
532
533 #if CONFIG_MACF
534 #if !XNU_TARGET_OS_OSX
535 int dlil_lladdr_ckreq = 1;
536 #else /* XNU_TARGET_OS_OSX */
537 int dlil_lladdr_ckreq = 0;
538 #endif /* XNU_TARGET_OS_OSX */
539 #endif /* CONFIG_MACF */
540
541 /* rate limit debug messages */
542 struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };
543
/* Atomically bump the global "delayed if_start disabled" counter */
static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}
549
550 static void log_hexdump(void *data, size_t len);
551
552 unsigned int net_rxpoll = 1;
553 unsigned int net_affinity = 1;
554 unsigned int net_async = 1; /* 0: synchronous, 1: asynchronous */
555
556 static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);
557
558 extern u_int32_t inject_buckets;
559
560 /* DLIL data threshold thread call */
561 static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);
562
/*
 * Called when an interface filter that affects TSO is attached
 * (filter_enable == TRUE) or detached (FALSE).
 */
void
ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
{
	/*
	 * update filter count and route_generation ID to let TCP
	 * know it should reevaluate doing TSO or not
	 */
	if (filter_enable) {
		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
	} else {
		VERIFY(ifp->if_flt_no_tso_count != 0);
		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
	}
	routegenid_update();
}
578
579 #if SKYWALK
580
581 static bool net_check_compatible_if_filter(struct ifnet *ifp);
582
583 /* if_attach_nx flags defined in os_skywalk_private.h */
584 unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
585 unsigned int if_enable_fsw_ip_netagent =
586 ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
587 unsigned int if_enable_fsw_transport_netagent =
588 ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
589
590 unsigned int if_netif_all =
591 ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);
592
593 /* Configure flowswitch to use max mtu sized buffer */
594 static bool fsw_use_max_mtu_buffer = false;
595
596
597 static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);
598
599 #include <skywalk/os_skywalk_private.h>
600
601 boolean_t
ifnet_nx_noauto(ifnet_t ifp)602 ifnet_nx_noauto(ifnet_t ifp)
603 {
604 return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
605 }
606
/* Flowswitch auto-attach is skipped for low-latency interfaces */
boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}
612
613 boolean_t
ifnet_is_low_latency(ifnet_t ifp)614 ifnet_is_low_latency(ifnet_t ifp)
615 {
616 return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
617 }
618
/*
 * Whether a netif compat nexus should be plumbed for `ifp', governed by
 * IF_ATTACH_NX_NETIF_COMPAT in if_attach_nx (with a memory-saving
 * exception for the "ap" Wi-Fi interface on embedded platforms).
 */
boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			/* "ap" interface only gets compat if netif-for-all is set */
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}
643
/*
 * Whether the flowswitch transport netagent should be enabled for `ifp':
 * requires the global enable, then per-family / per-interface checks.
 */
boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
{
	if (if_is_fsw_transport_netagent_enabled()) {
		/* check if netagent has been manually enabled for ipsec/utun */
		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
			return ipsec_interface_needs_netagent(ifp);
		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
			return utun_interface_needs_netagent(ifp);
		}

		/* check ifnet no auto nexus override */
		if (ifnet_nx_noauto(ifp)) {
			return FALSE;
		}

		/* check global if_attach_nx configuration */
		switch (ifp->if_family) {
		case IFNET_FAMILY_CELLULAR:
		case IFNET_FAMILY_ETHERNET:
			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
				return TRUE;
			}
			break;
		default:
			break;
		}
	}
	return FALSE;
}
674
675 boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)676 ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
677 {
678 #pragma unused(ifp)
679 if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
680 return TRUE;
681 }
682 return FALSE;
683 }
684
/* Netif netagent is governed solely by the global if_attach_nx flags */
boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)
{
#pragma unused(ifp)
	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
}
691
/*
 * Detach and free a single nexus provider instance (detaching the device
 * port first, if one was recorded).  Returns TRUE if an instance was
 * present and teardown was attempted, FALSE when there was nothing to do.
 * Errors from the teardown calls are logged but not propagated.
 */
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t err;

	/* nothing recorded for this instance */
	if (instance == NULL || uuid_is_null(instance)) {
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}
718
/*
 * Tear down a nexus: first the provider instance (and its device
 * attachment), then the registered provider itself.  Returns TRUE if
 * anything was torn down.
 */
static boolean_t
dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
    uuid_t device)
{
	boolean_t detached = FALSE;
	nexus_controller_t controller = kern_nexus_shared_controller();
	int err;

	if (dlil_detach_nexus_instance(controller, func_str, instance,
	    device)) {
		detached = TRUE;
	}
	if (provider != NULL && !uuid_is_null(provider)) {
		/* NOTE(review): counted as detached even if deregister fails */
		detached = TRUE;
		err = kern_nexus_controller_deregister_provider(controller,
		    provider);
		if (err != 0) {
			DLIL_PRINTF("%s deregister_provider %d\n",
			    func_str, err);
		}
	}
	return detached;
}
742
/*
 * Register a nexus provider of `type' (netif or flowswitch) named after
 * the interface, then allocate one instance of it.  On success (returns
 * 0) both *provider and *instance are filled in; on failure a partially
 * registered provider is deregistered before returning the errno.
 */
static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t dom_prov;
	errno_t err;
	nexus_name_t provider_name;
	const char *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
failed:
	/* the success path also lands here, with err == 0 */
	return err;
}
792
793 static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp,if_nexus_netif_t netif_nx)794 dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
795 {
796 nexus_attr_t attr = NULL;
797 nexus_controller_t controller;
798 errno_t err;
799
800 if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
801 /* it's already attached */
802 if (dlil_verbose) {
803 DLIL_PRINTF("%s: %s already has nexus attached\n",
804 __func__, if_name(ifp));
805 /* already attached */
806 }
807 goto failed;
808 }
809
810 err = kern_nexus_attr_create(&attr);
811 if (err != 0) {
812 DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
813 if_name(ifp));
814 goto failed;
815 }
816 err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
817 VERIFY(err == 0);
818
819 controller = kern_nexus_shared_controller();
820
821 /* create the netif provider and instance */
822 err = dlil_create_provider_and_instance(controller,
823 NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
824 &netif_nx->if_nif_instance, attr);
825 if (err != 0) {
826 goto failed;
827 }
828 err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
829 ifp, NULL, FALSE, &netif_nx->if_nif_attach);
830 if (err != 0) {
831 DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
832 __func__, err);
833 /* cleanup provider and instance */
834 dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
835 netif_nx->if_nif_instance, NULL);
836 goto failed;
837 }
838 return TRUE;
839
840 failed:
841 if (attr != NULL) {
842 kern_nexus_attr_destroy(attr);
843 }
844 return FALSE;
845 }
846
847 static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp,if_nexus_netif_t netif_nx)848 dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
849 {
850 if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
851 IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
852 goto failed;
853 }
854 switch (ifp->if_type) {
855 case IFT_CELLULAR:
856 case IFT_ETHER:
857 if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
858 /* don't auto-attach */
859 goto failed;
860 }
861 break;
862 default:
863 /* don't auto-attach */
864 goto failed;
865 }
866 return dlil_attach_netif_nexus_common(ifp, netif_nx);
867
868 failed:
869 return FALSE;
870 }
871
872 static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)873 dlil_is_native_netif_nexus(ifnet_t ifp)
874 {
875 return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
876 }
877
/*
 * Tear down the netif nexus (provider, instance and device attachment)
 * recorded in `nexus_netif'.
 */
__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}
885
886 static inline int
dlil_siocgifdevmtu(struct ifnet * ifp,struct ifdevmtu * ifdm_p)887 dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
888 {
889 struct ifreq ifr;
890 int error;
891
892 bzero(&ifr, sizeof(ifr));
893 error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
894 if (error == 0) {
895 *ifdm_p = ifr.ifr_devmtu;
896 }
897 return error;
898 }
899
/*
 * Adjust *large_buf_size based on the interface's TSO MTUs (or the GSO
 * MTU fallback), capped at NX_FSW_MAX_LARGE_BUFSIZE.  Only applies to
 * native Skywalk interfaces on macOS/server kernels.
 */
static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
{
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;

	if (!kernel_is_macos_or_server()) {
		return;
	}

	if (!dlil_is_native_netif_nexus(ifp)) {
		return;
	}
	/*
	 * Note that we are reading the real hwassist flags set by the driver
	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
	 * hasn't been called yet.
	 */
	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
		tso_v4_mtu = ifp->if_tso_v4_mtu;
	}
	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
		tso_v6_mtu = ifp->if_tso_v6_mtu;
	}
	/*
	 * If the hardware supports TSO, adjust the large buf size to match the
	 * supported TSO MTU size.
	 */
	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
	} else {
		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
	}
	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
}
935
936 static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp,uuid_t netif,uint32_t * buf_size,bool * use_multi_buflet,uint32_t * large_buf_size)937 _dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
938 bool *use_multi_buflet, uint32_t *large_buf_size)
939 {
940 struct kern_pbufpool_memory_info rx_pp_info;
941 struct kern_pbufpool_memory_info tx_pp_info;
942 uint32_t if_max_mtu = 0;
943 uint32_t drv_buf_size;
944 struct ifdevmtu ifdm;
945 int err;
946
947 /*
948 * To perform intra-stack RX aggregation flowswitch needs to use
949 * multi-buflet packet.
950 */
951 *use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();
952
953 *large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
954 /*
955 * IP over Thunderbolt interface can deliver the largest IP packet,
956 * but the driver advertises the MAX MTU as only 9K.
957 */
958 if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
959 if_max_mtu = IP_MAXPACKET;
960 goto skip_mtu_ioctl;
961 }
962
963 /* determine max mtu */
964 bzero(&ifdm, sizeof(ifdm));
965 err = dlil_siocgifdevmtu(ifp, &ifdm);
966 if (__improbable(err != 0)) {
967 DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
968 __func__, if_name(ifp));
969 /* use default flowswitch buffer size */
970 if_max_mtu = NX_FSW_BUFSIZE;
971 } else {
972 DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
973 ifdm.ifdm_max, ifdm.ifdm_current);
974 /* rdar://problem/44589731 */
975 if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
976 }
977
978 skip_mtu_ioctl:
979 if (if_max_mtu == 0) {
980 DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
981 __func__, if_name(ifp));
982 return EINVAL;
983 }
984 if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
985 DLIL_PRINTF("%s: interace (%s) has MAX MTU (%u) > flowswitch "
986 "max bufsize(%d)\n", __func__,
987 if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
988 return EINVAL;
989 }
990
991 /*
992 * for skywalk native driver, consult the driver packet pool also.
993 */
994 if (dlil_is_native_netif_nexus(ifp)) {
995 err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
996 &tx_pp_info);
997 if (err != 0) {
998 DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
999 __func__, if_name(ifp));
1000 return ENXIO;
1001 }
1002 drv_buf_size = tx_pp_info.kpm_bufsize *
1003 tx_pp_info.kpm_max_frags;
1004 if (if_max_mtu > drv_buf_size) {
1005 DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
1006 "tx %d * %d) can't support max mtu(%d)\n", __func__,
1007 if_name(ifp), rx_pp_info.kpm_bufsize,
1008 rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
1009 tx_pp_info.kpm_max_frags, if_max_mtu);
1010 return EINVAL;
1011 }
1012 } else {
1013 drv_buf_size = if_max_mtu;
1014 }
1015
1016 if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
1017 _CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
1018 *use_multi_buflet = true;
1019 /* default flowswitch buffer size */
1020 *buf_size = NX_FSW_BUFSIZE;
1021 *large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
1022 } else {
1023 *buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
1024 }
1025 _dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
1026 ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
1027 if (*buf_size >= *large_buf_size) {
1028 *large_buf_size = 0;
1029 }
1030 return 0;
1031 }
1032
/*
 * Create a flowswitch nexus and attach it to `ifp''s netif.
 * On success, fills in nexus_fsw with the provider, instance and device
 * attachment handles and returns TRUE.  On any failure (including the
 * interface not being eligible), logs the reason, destroys any
 * partially-created state and returns FALSE.
 */
static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err = 0;
	uuid_t netif;
	uint32_t buf_size = 0;
	uint32_t large_buf_size = 0;
	bool multi_buflet;

	/* interfaces excluded from flowswitch auto-attach */
	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	/* compute buffer geometry before creating the provider */
	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	/* err == 0 here means the interface was simply not eligible */
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}
1131
/*
 * Attach a flowswitch nexus to `ifp' if one is not already attached.
 * The nexus is built into a local copy first and only published to
 * ifp->if_nx_flowswitch under the ifnet lock, so a concurrent detach
 * (interface going away) never sees a half-initialized nexus.
 * Returns TRUE if the flowswitch was attached and published.
 */
static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t attached;
	if_nexus_flowswitch nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	/* test hook: allow netif-direct interfaces to bypass the fsw */
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s\n", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model\n",
		    if_name(ifp));
		return FALSE;
	}

	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		/* publish the new nexus only if the ifp is still attached */
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}
1177
/*
 * Detach the flowswitch nexus recorded in `nexus_fsw': hands the
 * provider, instance and device handles to dlil_detach_nexus() for
 * cleanup.
 */
__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}
1185
1186 __attribute__((noinline))
1187 static void
dlil_netif_detach_notify(ifnet_t ifp)1188 dlil_netif_detach_notify(ifnet_t ifp)
1189 {
1190 ifnet_detach_notify_cb_t notify = NULL;
1191 void *arg = NULL;
1192
1193 ifnet_get_detach_notify(ifp, ¬ify, &arg);
1194 if (notify == NULL) {
1195 DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
1196 return;
1197 }
1198 (*notify)(arg);
1199 }
1200
/*
 * Quiesce data movement on `ifp' and tear down its flowswitch and
 * netif nexuses (in that order: the flowswitch sits on top of the
 * netif).  The per-nexus UUID fields are used to decide whether each
 * nexus is attached; the asserts check that the handles are either all
 * set or all clear.
 */
__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	/* block and drain all data movement before detaching */
	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	/* allow data movement again */
	ifnet_datamov_resume(ifp);
}
1232
1233 boolean_t
ifnet_add_netagent(ifnet_t ifp)1234 ifnet_add_netagent(ifnet_t ifp)
1235 {
1236 int error;
1237
1238 error = kern_nexus_interface_add_netagent(ifp);
1239 os_log(OS_LOG_DEFAULT,
1240 "kern_nexus_interface_add_netagent(%s) returned %d",
1241 ifp->if_xname, error);
1242 return error == 0;
1243 }
1244
1245 boolean_t
ifnet_remove_netagent(ifnet_t ifp)1246 ifnet_remove_netagent(ifnet_t ifp)
1247 {
1248 int error;
1249
1250 error = kern_nexus_interface_remove_netagent(ifp);
1251 os_log(OS_LOG_DEFAULT,
1252 "kern_nexus_interface_remove_netagent(%s) returned %d",
1253 ifp->if_xname, error);
1254 return error == 0;
1255 }
1256
1257 boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)1258 ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1259 {
1260 if (!IF_FULLY_ATTACHED(ifp)) {
1261 return FALSE;
1262 }
1263 return dlil_attach_flowswitch_nexus(ifp);
1264 }
1265
1266 boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)1267 ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1268 {
1269 if_nexus_flowswitch nexus_fsw;
1270
1271 ifnet_lock_exclusive(ifp);
1272 nexus_fsw = ifp->if_nx_flowswitch;
1273 bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
1274 ifnet_lock_done(ifp);
1275 return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
1276 nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
1277 }
1278
1279 void
ifnet_attach_native_flowswitch(ifnet_t ifp)1280 ifnet_attach_native_flowswitch(ifnet_t ifp)
1281 {
1282 if (!dlil_is_native_netif_nexus(ifp)) {
1283 /* not a native netif */
1284 return;
1285 }
1286 ifnet_attach_flowswitch_nexus(ifp);
1287 }
1288
/*
 * Install (or clear, with cb == NULL) the flowswitch RX callback on
 * `ifp'.  Blocks until every in-flight user of the previous callback
 * (tracked by if_fsw_rx_cb_ref) has released it via
 * ifnet_release_flowswitch_rx_callback() before swapping.
 * Always returns 0.
 */
int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	while (ifp->if_fsw_rx_cb_ref > 0) {
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		/* sleep until the release path drops the last reference */
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1304
/*
 * Fetch the flowswitch RX callback and its argument, taking a
 * reference (if_fsw_rx_cb_ref) that the caller must drop with
 * ifnet_release_flowswitch_rx_callback().  Returns ENOENT when no
 * callback is installed.
 */
int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * This is for avoiding the unnecessary lock acquire for interfaces
	 * not used by a redirect interface.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	/* re-check under the lock: the callback may have been cleared */
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}
1326
/*
 * Drop a reference taken by ifnet_get_flowswitch_rx_callback().
 * Wakes any thread in ifnet_set_flowswitch_rx_callback() waiting for
 * the reference count to reach zero.
 */
void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (--ifp->if_fsw_rx_cb_ref == 0) {
		wakeup(&ifp->if_fsw_rx_cb_ref);
	}
	lck_mtx_unlock(&ifp->if_delegate_lock);
}
1336
/*
 * Set (or clear, with parent == NULL) the delegate parent of `difp'.
 * Blocks until all outstanding users of the current parent (tracked by
 * if_delegate_parent_ref) have released it.  Always returns 0.
 */
int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	while (difp->if_delegate_parent_ref > 0) {
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		/* sleep until ifnet_release_delegate_parent() wakes us */
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1351
/*
 * Fetch the delegate parent of `difp', taking a reference
 * (if_delegate_parent_ref) that the caller must drop with
 * ifnet_release_delegate_parent().  Returns ENOENT when no parent
 * is set.
 */
int
ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (difp->if_delegate_parent == NULL) {
		lck_mtx_unlock(&difp->if_delegate_lock);
		return ENOENT;
	}
	*parentp = difp->if_delegate_parent;
	difp->if_delegate_parent_ref++;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}
1365
/*
 * Drop a reference taken by ifnet_get_delegate_parent().  Wakes any
 * thread in ifnet_set_delegate_parent() waiting for the reference
 * count to reach zero.
 */
void
ifnet_release_delegate_parent(ifnet_t difp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (--difp->if_delegate_parent_ref == 0) {
		wakeup(&difp->if_delegate_parent_ref);
	}
	lck_mtx_unlock(&difp->if_delegate_lock);
}
1375
/*
 * Set the detach-notify callback/arg on `ifp'.  Caller must hold the
 * ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}

/*
 * Read the detach-notify callback/arg from `ifp'.  Caller must hold
 * the ifnet lock exclusively.
 */
__attribute__((noinline))
void
ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	*notifyp = ifp->if_detach_notify;
	*argp = ifp->if_detach_notify_arg;
}

/*
 * Locking wrapper around ifnet_set_detach_notify_locked().
 */
__attribute__((noinline))
void
ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_exclusive(ifp);
	ifnet_set_detach_notify_locked(ifp, notify, arg);
	ifnet_lock_done(ifp);
}

/*
 * Locking wrapper around ifnet_get_detach_notify_locked().
 */
__attribute__((noinline))
void
ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_exclusive(ifp);
	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
	ifnet_lock_done(ifp);
}
1411 #endif /* SKYWALK */
1412
/*
 * DLIL_INPUT_CHECK: sanity-check an inbound mbuf: it must carry a
 * packet header and its receive interface must match `ifp' (except for
 * packets arriving on the loopback interface).  Panics on violation.
 */
#define DLIL_INPUT_CHECK(m, ifp) {                                      \
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);                    \
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||       \
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {                           \
	        panic_plain("%s: invalid mbuf %p\n", __func__, m);      \
	        /* NOTREACHED */                                        \
	}                                                               \
}

/*
 * DLIL_EWMA: exponentially-weighted moving average with a power-of-two
 * decay: old = old - old/2^decay + new/2^decay (computed via shifts);
 * when old is 0 the average is seeded directly with new.
 */
#define DLIL_EWMA(old, new, decay) do {                                 \
	u_int32_t _avg;                                                 \
	if ((_avg = (old)) > 0)                                         \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)

/* link-speed units used by the rxpoll table below */
#define MBPS (1ULL * 1000 * 1000)
#define GBPS (MBPS * 1000)

struct rxpoll_time_tbl {
	u_int64_t speed;        /* downlink speed */
	u_int32_t plowat;       /* packets low watermark */
	u_int32_t phiwat;       /* packets high watermark */
	u_int32_t blowat;       /* bytes low watermark */
	u_int32_t bhiwat;       /* bytes high watermark */
};

/* per-link-speed polling watermarks; zero speed terminates the table */
static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};

/* protects dlil_pending_thread_cnt and its wakeup channel */
static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;
1454
/*
 * Bump the count of DLIL threads still starting up; paired with
 * dlil_decr_pending_thread_count().
 */
static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1463
/*
 * Drop the pending-thread count; wakes any waiter sleeping on
 * dlil_pending_thread_cnt once all pending threads are accounted for.
 */
static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}
1476
1477 int
proto_hash_value(u_int32_t protocol_family)1478 proto_hash_value(u_int32_t protocol_family)
1479 {
1480 /*
1481 * dlil_proto_unplumb_all() depends on the mapping between
1482 * the hash bucket index and the protocol family defined
1483 * here; future changes must be applied there as well.
1484 */
1485 switch (protocol_family) {
1486 case PF_INET:
1487 return 0;
1488 case PF_INET6:
1489 return 1;
1490 case PF_VLAN:
1491 return 2;
1492 case PF_UNSPEC:
1493 default:
1494 return 3;
1495 }
1496 }
1497
1498 /*
1499 * Caller must already be holding ifnet lock.
1500 */
1501 static struct if_proto *
find_attached_proto(struct ifnet * ifp,u_int32_t protocol_family)1502 find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1503 {
1504 struct if_proto *proto = NULL;
1505 u_int32_t i = proto_hash_value(protocol_family);
1506
1507 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1508
1509 if (ifp->if_proto_hash != NULL) {
1510 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1511 }
1512
1513 while (proto != NULL && proto->protocol_family != protocol_family) {
1514 proto = SLIST_NEXT(proto, next_hash);
1515 }
1516
1517 if (proto != NULL) {
1518 if_proto_ref(proto);
1519 }
1520
1521 return proto;
1522 }
1523
/*
 * Take a reference on an attached-protocol entry; released with
 * if_proto_free().
 */
static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}
1529
1530 extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1531
/*
 * Drop a reference on `proto'.  When the last reference is released,
 * invoke the protocol's detached callback, purge its routes, post
 * KEV_DL_PROTO_DETACHED (carrying the remaining protocol count), mark
 * the interface down if no protocols remain, and free the entry.
 */
static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		/* references remain; nothing to tear down yet */
		return;
	}

	/* notify the protocol (KPI v1 or v2) that it has been detached */
	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Cleanup routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more reference on this, protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}
1593
1594 __private_extern__ void
ifnet_lock_assert(struct ifnet * ifp,ifnet_lock_assert_t what)1595 ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1596 {
1597 #if !MACH_ASSERT
1598 #pragma unused(ifp)
1599 #endif
1600 unsigned int type = 0;
1601 int ass = 1;
1602
1603 switch (what) {
1604 case IFNET_LCK_ASSERT_EXCLUSIVE:
1605 type = LCK_RW_ASSERT_EXCLUSIVE;
1606 break;
1607
1608 case IFNET_LCK_ASSERT_SHARED:
1609 type = LCK_RW_ASSERT_SHARED;
1610 break;
1611
1612 case IFNET_LCK_ASSERT_OWNED:
1613 type = LCK_RW_ASSERT_HELD;
1614 break;
1615
1616 case IFNET_LCK_ASSERT_NOTOWNED:
1617 /* nothing to do here for RW lock; bypass assert */
1618 ass = 0;
1619 break;
1620
1621 default:
1622 panic("bad ifnet assert type: %d", what);
1623 /* NOTREACHED */
1624 }
1625 if (ass) {
1626 LCK_RW_ASSERT(&ifp->if_lock, type);
1627 }
1628 }
1629
/* Acquire the per-ifnet RW lock for reading. */
__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

/* Acquire the per-ifnet RW lock for writing. */
__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

/* Release the per-ifnet RW lock (shared or exclusive). */
__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}
1647
#if INET
/* Acquire the per-ifnet IPv4 data RW lock for reading. */
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

/* Acquire the per-ifnet IPv4 data RW lock for writing. */
__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

/* Release the per-ifnet IPv4 data RW lock. */
__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif
1667
/* Acquire the per-ifnet IPv6 data RW lock for reading. */
__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

/* Acquire the per-ifnet IPv6 data RW lock for writing. */
__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

/* Release the per-ifnet IPv6 data RW lock. */
__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}
1685
/* Acquire the global interface-list RW lock for reading. */
__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

/* Acquire the global interface-list RW lock for writing. */
__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

/* Release the global interface-list RW lock. */
__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

/* Assert the global interface-list lock is held exclusively. */
__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}
1709
1710 /*
1711 * dlil_ifp_protolist
1712 * - get the list of protocols attached to the interface, or just the number
1713 * of attached protocols
1714 * - if the number returned is greater than 'list_count', truncation occurred
1715 *
1716 * Note:
1717 * - caller must already be holding ifnet lock.
1718 */
1719 static u_int32_t
dlil_ifp_protolist(struct ifnet * ifp,protocol_family_t * list,u_int32_t list_count)1720 dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
1721 u_int32_t list_count)
1722 {
1723 u_int32_t count = 0;
1724 int i;
1725
1726 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1727
1728 if (ifp->if_proto_hash == NULL) {
1729 goto done;
1730 }
1731
1732 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
1733 struct if_proto *proto;
1734 SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
1735 if (list != NULL && count < list_count) {
1736 list[count] = proto->protocol_family;
1737 }
1738 count++;
1739 }
1740 }
1741 done:
1742 return count;
1743 }
1744
/*
 * Locking wrapper around dlil_ifp_protolist(): returns the number of
 * protocols attached to `ifp', filling `protolist' up to `count'
 * entries.
 */
__private_extern__ u_int32_t
if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
{
	ifnet_lock_shared(ifp);
	count = dlil_ifp_protolist(ifp, protolist, count);
	ifnet_lock_done(ifp);
	return count;
}
1753
/* Free a protocol list previously returned to an if_get_protolist() caller. */
__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}
1759
/*
 * Post a KEV_NETWORK_CLASS kernel event for `ifp'.
 * `event_data' may be NULL, in which case a plain net_event_data is
 * synthesized; either way the interface name/family/unit are filled in
 * before posting.  The interface generation count is updated unless the
 * event is one of the frequent link-quality/state codes or the caller
 * passes `suppress_generation'.  Returns the result of
 * dlil_event_internal().
 */
__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * a net event always starts with a net_event_data structure
	 * but the caller can generate a simple net event or
	 * provide a longer event structure to post
	 */
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = event_subclass;
	ev_msg.event_code = event_code;

	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	/* stamp the event with the originating interface's identity */
	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/* Don't update interface generation for frequent link quality and state changes */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that update generation counts might
	 * want to suppress generation count.
	 * One example is node presence/absence where we still
	 * issue kernel event for the invocation but want to avoid
	 * expensive operation of updating generation which triggers
	 * NECP client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
1821
/*
 * Allocate the per-interface protocol statistics structures:
 * 64-bit-aligned tcpstat_local/udpstat_local blocks (with the original
 * zone pointer stashed one pointer-width before the aligned base so it
 * can be recovered at free time) and the IPv4/IPv6 ECN stat blocks.
 * Returns 0 on success; on failure all partially-allocated stats are
 * released.  Note: ret stays EINVAL unless the tcp/udp branch runs, in
 * which case the cleanup path at `end' also tears down the ECN blocks.
 */
__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	if (ifp != NULL && ret != 0) {
		/* failure: release everything allocated above */
		if (ifp->if_tcp_stat != NULL) {
			/* recover the original zone pointer stashed before base */
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The macro kfree_type sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}
1907
/*
 * Reset all RX-polling state on `ifp' to its initial (polling off)
 * configuration: clears the poll cycle, flags, counters, statistics
 * and all polling-related timers.
 */
static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}
1926
/*
 * Create and start the input thread associated with `inp'.
 *
 * A NULL `ifp' creates the main (shared) DLIL input thread, used at
 * dlil_init() time.  Otherwise a dedicated thread is created for the
 * interface, using one of three strategies: legacy hybrid polling,
 * asynchronous (worker thread), or synchronous (no dedicated thread).
 *
 * On return, `*thfunc' (when non-NULL) holds the thread continuation
 * that was selected, or NULL when the synchronous strategy applies.
 * Returns 0 on success, ENODEV when no thread is needed (synchronous
 * strategy); panics if the thread cannot be created.
 */
static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	/* legacy (non-netif) interfaces capable of opportunistic polling */
	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	/* per-thread lock group/mutex, named after the thread */
	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp;            /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		/* the main thread also owns the loopback receive queue */
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		/* synchronous strategy: no dedicated input thread needed */
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		/* run the input thread at default importance */
		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}
2068
/*
 * Reset a dlil_threading_info structure after its input thread has
 * terminated: destroy its lock and lock group, and clear all state.
 * The VERIFYs assert that no packets, affinity settings or auxiliary
 * (driver/poller) threads remain at this point.
 */
static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	/* the mutex must be destroyed before its group is freed */
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	/* queue must already have been drained; just reset its limit */
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}
2094
/*
 * Final teardown for a dedicated (per-interface) input thread; runs
 * on the terminating thread itself, never on the main input thread.
 * Drains any queued packets, signals DLIL_INPUT_TERMINATE_COMPLETE
 * to the waiting thread, drops the extra reference taken by
 * kernel_thread_start(), and terminates the calling thread.
 * Does not return.
 */
static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	/* drain the queue and signal completion under the lock */
	lck_mtx_lock_spin(&inp->dlth_lock);
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}
2142
2143 static kern_return_t
dlil_affinity_set(struct thread * tp,u_int32_t tag)2144 dlil_affinity_set(struct thread *tp, u_int32_t tag)
2145 {
2146 thread_affinity_policy_data_t policy;
2147
2148 bzero(&policy, sizeof(policy));
2149 policy.affinity_tag = tag;
2150 return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
2151 (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2152 }
2153
#if SKYWALK
/*
 * Eventhandler callback invoked when the set of active network filter
 * subsystems changes.  The flow-switch transport netagent is enabled
 * only while no filters other than the private PF proxy are active;
 * any other filter forces it off.  When the enabled state flips, the
 * netagents are re-published; while it remains off, NECP clients are
 * told to re-evaluate instead.
 *
 * Fix: `state' is a bitmask logged with a "0x" prefix, so the
 * conversion must be hexadecimal (%x), not decimal (%d).
 */
static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,
    enum net_filter_event_subsystems state)
{
	evhlog(debug, "%s: eventhandler saw event type=net_filter_event_state event_code=0x%x",
	    __func__, state);

	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
		if_enable_fsw_transport_netagent = 1;
	} else {
		if_enable_fsw_transport_netagent = 0;
	}
	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
		/* enabled state flipped; publish the change */
		kern_nexus_update_netagents();
	} else if (!if_enable_fsw_transport_netagent) {
		/* still disabled; have NECP clients re-evaluate */
		necp_update_all_clients();
	}
}
#endif /* SKYWALK */
2175
/*
 * One-time initialization of the data link interface layer (DLIL).
 * Validates compile-time layout/constant invariants, applies boot-arg
 * overrides, creates the zones used for dlil_ifnet and per-interface
 * TCP/UDP stats allocations, initializes dependent networking
 * subsystems, then spawns the main DLIL input thread and the ifnet
 * detacher thread, blocking until both have run at least once.
 */
void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	/* boot-arg overrides for DLIL tunables */
	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled. This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, and so checking for the
	 * presence itself is enough (no need to check for the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
	if (kernel_is_macos_or_server() && if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	/* debug builds use the larger dlil_ifnet_dbg variant */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize droptap interface */
	droptap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed. The interface is added to delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the created kernel threads for dlil to get
	 * scheduled and run at least once before we proceed
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: Waiting for all the create dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: All the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}
2491
2492 static void
if_flt_monitor_busy(struct ifnet * ifp)2493 if_flt_monitor_busy(struct ifnet *ifp)
2494 {
2495 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2496
2497 ++ifp->if_flt_busy;
2498 VERIFY(ifp->if_flt_busy != 0);
2499 }
2500
/*
 * Drop a busy hold on the interface filter monitor; thin wrapper
 * around if_flt_monitor_leave(), pairing with if_flt_monitor_busy().
 */
static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}
2506
2507 static void
if_flt_monitor_enter(struct ifnet * ifp)2508 if_flt_monitor_enter(struct ifnet *ifp)
2509 {
2510 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2511
2512 while (ifp->if_flt_busy) {
2513 ++ifp->if_flt_waiters;
2514 (void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
2515 (PZERO - 1), "if_flt_monitor", NULL);
2516 }
2517 if_flt_monitor_busy(ifp);
2518 }
2519
2520 static void
if_flt_monitor_leave(struct ifnet * ifp)2521 if_flt_monitor_leave(struct ifnet *ifp)
2522 {
2523 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2524
2525 VERIFY(ifp->if_flt_busy != 0);
2526 --ifp->if_flt_busy;
2527
2528 if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2529 ifp->if_flt_waiters = 0;
2530 wakeup(&ifp->if_flt_head);
2531 }
2532 }
2533
/*
 * Attach the interface filter described by `if_filter' to `ifp' and
 * return the newly allocated filter via `filter_ref'.  The filter's
 * input/output/event/ioctl callbacks are not installed on internal
 * coprocessor or management interfaces (only filt_detached is).
 * Returns 0 on success, or ENXIO if the interface is not on the
 * global list or no longer attached.
 */
__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	/* takes an I/O refcnt on success; released before returning */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for internal coproc interface
	 * and for management interfaces
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	/* insert into the filter list under the filter monitor */
	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump filter count and route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK
	if (kernel_is_macos_or_server()) {
		net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
		    net_check_compatible_if_filter(NULL));
	}
#endif /* SKYWALK */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	/* drop the I/O refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}
2626
/*
 * Detach an interface filter.  With `detached' == 0 the filter is
 * searched for on every interface in the global list and removed
 * under the filter monitor; a filter not found anywhere yields
 * EINVAL.  With `detached' != 0 the caller (ifnet_detach_final) has
 * already emptied if_flt_head, so only the accounting is adjusted.
 * In both cases the filter's filt_detached callback (if any) runs
 * and the filter memory is freed.
 */
static int
dlil_detach_filter_internal(interface_filter_t filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance. Interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1; /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease filter count and route_generation
				 * ID to let TCP know it should reevaluate doing
				 * TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				/*
				 * When we remove the bridge's interface filter,
				 * clear the field in the ifnet.
				 */
				if ((filter->filt_flags & DLIL_IFF_BRIDGE)
				    != 0) {
					ifp->if_bridge = NULL;
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away. Make sure to adjust the counters
		 * in this case. We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease filter count and route_generation
		 * ID to let TCP know it should reevaluate doing
		 * TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK
	if (kernel_is_macos_or_server()) {
		net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
		    net_check_compatible_if_filter(NULL));
	}
#endif /* SKYWALK */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	/* NULLed after zfree above, so this only fires on the EINVAL path */
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}
2757
2758 __private_extern__ void
dlil_detach_filter(interface_filter_t filter)2759 dlil_detach_filter(interface_filter_t filter)
2760 {
2761 if (filter == NULL) {
2762 return;
2763 }
2764 dlil_detach_filter_internal(filter, 0);
2765 }
2766
2767 __private_extern__ boolean_t
dlil_has_ip_filter(void)2768 dlil_has_ip_filter(void)
2769 {
2770 boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
2771
2772 VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
2773
2774 DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
2775 return has_filter;
2776 }
2777
2778 __private_extern__ boolean_t
dlil_has_if_filter(struct ifnet * ifp)2779 dlil_has_if_filter(struct ifnet *ifp)
2780 {
2781 boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
2782 DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
2783 return has_filter;
2784 }
2785
2786 static inline void
dlil_input_wakeup(struct dlil_threading_info * inp)2787 dlil_input_wakeup(struct dlil_threading_info *inp)
2788 {
2789 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
2790
2791 inp->dlth_flags |= DLIL_INPUT_WAITING;
2792 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
2793 inp->dlth_wtot++;
2794 wakeup_one((caddr_t)&inp->dlth_flags);
2795 }
2796 }
2797
/*
 * Entry point for the main DLIL input thread.  Marks the thread
 * embryonic, issues a self-wakeup so the continuation runs once to
 * clear that state, then blocks into dlil_main_input_thread_cont,
 * which loops forever.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before the self-wakeup below */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
2820
2821 /*
2822 * Main input thread:
2823 *
2824 * a) handles all inbound packets for lo0
2825 * b) handles all inbound packets for interfaces with no dedicated
2826 * input thread (e.g. anything but Ethernet/PDP or those that support
2827 * opportunistic polling.)
2828 * c) protocol registrations
2829 * d) packet injections
2830 */
/*
 * Continuation body for the main input thread: drain the shared input
 * queue and the lo0-only queue, deliver the chains, service protocol
 * registration/injection requests, then re-arm the wait and block again.
 * Runs forever; never returns.
 */
__attribute__((noreturn))
static void
dlil_main_input_thread_cont(void *v, wait_result_t wres)
{
	/* same object viewed both as the main-thread info and generic info */
	struct dlil_main_threading_info *inpm = v;
	struct dlil_threading_info *inp = v;

	/* main input thread is uninterruptible */
	VERIFY(wres != THREAD_INTERRUPTED);
	lck_mtx_lock_spin(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
	    DLIL_INPUT_RUNNING)));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL, *m_loop = NULL;
		u_int32_t m_cnt, m_cnt_loop;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t proto_req;
		boolean_t embryonic;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		proto_req = (inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));

		/* Packets for non-dedicated interfaces other than lo0 */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		/* Packets exclusive to lo0 */
		m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
		_getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
		m_loop = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

		/* drop the lock while delivering; queues were snapshotted */
		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			dlil_decr_pending_thread_count();
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m_loop != NULL)) {
			dlil_input_packet_list_extended(lo_ifp, m_loop,
			    m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m,
			    m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
		}

		if (__improbable(proto_req)) {
			proto_input_run();
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* main input thread cannot be terminated */
		VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
		/* no other flags pending: go back to sleep */
		if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
2917
2918 /*
2919 * Input thread for interfaces with legacy input model.
2920 */
/*
 * Bootstrap entry point for a per-interface (legacy model) input thread.
 * Names the thread after the interface, marks it embryonic, self-wakes
 * once so the continuation completes initialization, and blocks with
 * dlil_input_thread_cont as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL);
	/* legacy thread only for non-RXPOLL (or rxpoll-disabled) interfaces */
	VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
	    !(ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before the self-wakeup so the event isn't lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
2955
/*
 * Continuation body for a per-interface (legacy model) input thread:
 * drain this interface's input queue, sync stats, deliver the chain,
 * then re-arm and block.  Honors DLIL_INPUT_TERMINATE requests (and
 * THREAD_INTERRUPTED wakeups) by handing off to
 * dlil_terminate_input_thread(), which does not return.
 */
__attribute__((noreturn))
static void
dlil_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify = FALSE;
		boolean_t embryonic;
		u_int32_t m_cnt;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/* first pass after creation: leave embryonic state */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
		}

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Packets for this interface */
		m_cnt = qlen(&inp->dlth_pkts);
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;

		inp->dlth_wtot = 0;

#if SKYWALK
		/*
		 * If this interface is attached to a netif nexus,
		 * the stats are already incremented there; otherwise
		 * do it here.
		 */
		if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m,
			    m_cnt, ifp->if_poll_mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* nothing pending besides RUNNING/TERMINATE: stop draining */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3059
3060 /*
3061 * Input thread for interfaces with opportunistic polling input model.
3062 */
/*
 * Bootstrap entry point for an opportunistic-polling (RXPOLL) input
 * thread.  Only legacy RXPOLL-capable interfaces get this thread.
 * Names the thread, marks it embryonic, self-wakes once so the
 * continuation completes initialization, then blocks with
 * dlil_rxpoll_input_thread_cont as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	/* arm the wait before the self-wakeup so the event isn't lost */
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}
3094
/*
 * Continuation body for an RXPOLL input thread.  Per iteration:
 * drain the interface's input queue, accumulate packet/byte/wakeup
 * statistics over a sampling window, and use EWMA-smoothed rates
 * against the low/high watermarks to decide whether to switch the
 * interface between interrupt (POLL_OFF) and polling (POLL_ON) mode;
 * mode changes are downcalled to the driver via if_input_ctl.
 * Honors DLIL_INPUT_TERMINATE by handing off to
 * dlil_terminate_input_thread(), which does not return.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		/*
		 * First pass after creation: leave embryonic state and
		 * skip the sampling/mode logic for this iteration.
		 */
		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		/* clamp the poll interval to the supported minimum */
		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory the latter can utilize
		 * the corresponding input thread where the packet arrived
		 * on, but that requires our knowing the interface in advance
		 * (and the benefits might not worth the trouble.)
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			*(&ifp->if_poll_sample_lasttime) = *(&now);
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			/* sampling window not yet elapsed */
			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			*(&ifp->if_poll_sample_lasttime) = *(&now);

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					*(&ifp->if_poll_dbg_lasttime) = *(&now);
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				*(&ifp->if_poll_mode_lasttime) = *(&now);
			}

			/* hold time rate-limits how often we flip modes */
			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				*(&ifp->if_poll_mode_lasttime) = *(&now);
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and interface is still attached,
		 * perform a downcall to the driver for the new mode. Also
		 * hold an IO refcnt on the interface to prevent it from
		 * being detached (will be release below.)
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the IO refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE warning %%% attention !!!!
		 * We should think about putting some thread starvation
		 * safeguards if we deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(ifp, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		/* nothing pending besides RUNNING/TERMINATE: stop draining */
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
3380
3381 errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params * p)3382 dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3383 {
3384 if (p != NULL) {
3385 if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3386 (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3387 return EINVAL;
3388 }
3389 if (p->packets_lowat != 0 && /* hiwat must be non-zero */
3390 p->packets_lowat >= p->packets_hiwat) {
3391 return EINVAL;
3392 }
3393 if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3394 (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3395 return EINVAL;
3396 }
3397 if (p->bytes_lowat != 0 && /* hiwat must be non-zero */
3398 p->bytes_lowat >= p->bytes_hiwat) {
3399 return EINVAL;
3400 }
3401 if (p->interval_time != 0 &&
3402 p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3403 p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3404 }
3405 }
3406 return 0;
3407 }
3408
/*
 * Recompute the interface's polling watermarks and interval.
 *
 * When the link rate is unknown (0) and no explicit parameters are
 * given, polling is effectively disabled (zero low watermarks,
 * maxed-out high watermarks).  Otherwise, values are looked up from
 * rxpoll_tbl[] by link speed and individually overridden by any
 * non-zero caller-supplied parameter — except that the if_rxpoll_max
 * and if_rxpoll_interval_time sysctls, when set away from their
 * defaults, take precedence over the caller's values.
 */
void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		/* pick the highest table row whose speed <= link rate */
		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		/* a non-zero if_rxpoll_max sysctl overrides the caller */
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ? if_rxpoll_max : p->packets_limit);
		/* a non-default interval sysctl overrides the caller */
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
		    "poll interval %llu nsec, pkts per poll %u, "
		    "pkt limits [%u/%u], wreq limits [%u/%u], "
		    "bytes limits [%u/%u]\n", if_name(ifp),
		    inbw, sample_holdtime, ifp->if_rxpoll_ival,
		    ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
		    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
		    ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
		    ifp->if_rxpoll_bhiwat);
	}
}
3478
3479 /*
3480 * Must be called on an attached ifnet (caller is expected to check.)
3481 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3482 */
/*
 * Validate and apply polling parameters to an RXPOLL interface.
 * `locked' indicates the caller already holds the input thread's
 * dlth_lock (e.g. when invoked from the poller thread itself).
 * Returns ENXIO if the interface does not support polling or has
 * no input thread, or the validation error from
 * dlil_rxpoll_validate_params().
 */
errno_t
dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
    boolean_t locked)
{
	errno_t err;
	struct dlil_threading_info *inp;

	VERIFY(ifp != NULL);
	if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
		return ENXIO;
	}
	err = dlil_rxpoll_validate_params(p);
	if (err != 0) {
		return err;
	}

	if (!locked) {
		lck_mtx_lock(&inp->dlth_lock);
	}
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Normally, we'd reset the parameters to the auto-tuned values
	 * if the the input thread detects a change in link rate. If the
	 * driver provides its own parameters right after a link rate
	 * changes, but before the input thread gets to run, we want to
	 * make sure to keep the driver's values. Clearing if_poll_update
	 * will achieve that.
	 */
	if (p != NULL && !locked && ifp->if_poll_update != 0) {
		ifp->if_poll_update = 0;
	}
	dlil_rxpoll_update_params(ifp, p);
	if (!locked) {
		lck_mtx_unlock(&inp->dlth_lock);
	}
	return 0;
}
3520
3521 /*
3522 * Must be called on an attached ifnet (caller is expected to check.)
3523 */
3524 errno_t
dlil_rxpoll_get_params(struct ifnet * ifp,struct ifnet_poll_params * p)3525 dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3526 {
3527 struct dlil_threading_info *inp;
3528
3529 VERIFY(ifp != NULL && p != NULL);
3530 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3531 return ENXIO;
3532 }
3533
3534 bzero(p, sizeof(*p));
3535
3536 lck_mtx_lock(&inp->dlth_lock);
3537 p->packets_limit = ifp->if_rxpoll_plim;
3538 p->packets_lowat = ifp->if_rxpoll_plowat;
3539 p->packets_hiwat = ifp->if_rxpoll_phiwat;
3540 p->bytes_lowat = ifp->if_rxpoll_blowat;
3541 p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3542 p->interval_time = ifp->if_rxpoll_ival;
3543 lck_mtx_unlock(&inp->dlth_lock);
3544
3545 return 0;
3546 }
3547
3548 errno_t
ifnet_input(struct ifnet * ifp,struct mbuf * m_head,const struct ifnet_stat_increment_param * s)3549 ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
3550 const struct ifnet_stat_increment_param *s)
3551 {
3552 return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
3553 }
3554
3555 errno_t
ifnet_input_extended(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3556 ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
3557 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3558 {
3559 return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
3560 }
3561
3562 errno_t
ifnet_input_poll(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s)3563 ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3564 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3565 {
3566 return ifnet_input_common(ifp, m_head, m_tail, s,
3567 (m_head != NULL), TRUE);
3568 }
3569
/*
 * Common implementation behind the ifnet_input* KPIs.
 *
 * Validates the chain/stats arguments, takes a datamov (IO) reference
 * on the interface so it cannot detach mid-delivery, computes or
 * verifies the chain's packet/byte counts, and hands the chain to the
 * interface's installed input function (if_input_dlil).  Packets are
 * freed (never leaked) on every early-error path.
 *
 * `ext'  — caller used the extended KPI (tail + stats provided).
 * `poll' — caller is the polling path (empty chains permitted).
 */
static errno_t
ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
{
	dlil_input_func input_func;
	struct ifnet_stat_increment_param _s;
	u_int32_t m_cnt = 0, m_size = 0;
	struct mbuf *last;
	errno_t err = 0;

	/* an empty chain is only valid for poll; ext requires stats */
	if ((m_head == NULL && !poll) || (s == NULL && ext)) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
	VERIFY(m_tail == NULL || ext);
	VERIFY(s != NULL || !ext);

	/*
	 * Drop the packet(s) if the parameters are invalid, or if the
	 * interface is no longer attached; else hold an IO refcnt to
	 * prevent it from being detached (will be released below.)
	 */
	if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
		if (m_head != NULL) {
			mbuf_freem_list(m_head);
		}
		return EINVAL;
	}

	input_func = ifp->if_input_dlil;
	VERIFY(input_func != NULL);

	if (m_tail == NULL) {
		/* no tail given: walk the chain to find it and count */
		last = m_head;
		while (m_head != NULL) {
#if IFNET_INPUT_SANITY_CHK
			if (__improbable(dlil_input_sanity_check != 0)) {
				DLIL_INPUT_CHECK(last, ifp);
			}
#endif /* IFNET_INPUT_SANITY_CHK */
			m_cnt++;
			m_size += m_length(last);
			if (mbuf_nextpkt(last) == NULL) {
				break;
			}
			last = mbuf_nextpkt(last);
		}
		m_tail = last;
	} else {
#if IFNET_INPUT_SANITY_CHK
		if (__improbable(dlil_input_sanity_check != 0)) {
			/* re-walk the chain to cross-check the caller */
			last = m_head;
			while (1) {
				DLIL_INPUT_CHECK(last, ifp);
				m_cnt++;
				m_size += m_length(last);
				if (mbuf_nextpkt(last) == NULL) {
					break;
				}
				last = mbuf_nextpkt(last);
			}
		} else {
			m_cnt = s->packets_in;
			m_size = s->bytes_in;
			last = m_tail;
		}
#else
		m_cnt = s->packets_in;
		m_size = s->bytes_in;
		last = m_tail;
#endif /* IFNET_INPUT_SANITY_CHK */
	}

	if (last != m_tail) {
		panic_plain("%s: invalid input packet chain for %s, "
		    "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
		    m_tail, last);
	}

	/*
	 * Assert packet count only for the extended variant, for backwards
	 * compatibility, since this came directly from the device driver.
	 * Relax this assertion for input bytes, as the driver may have
	 * included the link-layer headers in the computation; hence
	 * m_size is just an approximation.
	 */
	if (ext && s->packets_in != m_cnt) {
		panic_plain("%s: input packet count mismatch for %s, "
		    "%d instead of %d\n", __func__, if_name(ifp),
		    s->packets_in, m_cnt);
	}

	/* normalize the stats into a local copy with our computed counts */
	if (s == NULL) {
		bzero(&_s, sizeof(_s));
		s = &_s;
	} else {
		_s = *s;
	}
	_s.packets_in = m_cnt;
	_s.bytes_in = m_size;

	if (ifp->if_xflags & IFXF_DISABLE_INPUT) {
		/* input administratively disabled: count, then drop */
		m_freem_list(m_head);

		os_atomic_add(&ifp->if_data.ifi_ipackets, _s.packets_in, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes, _s.bytes_in, relaxed);

		goto done;
	}

	err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());

done:
	if (ifp != lo_ifp) {
		/* Release the IO refcnt */
		ifnet_datamov_end(ifp);
	}

	return err;
}
3694
3695 #if SKYWALK
3696 errno_t
dlil_set_input_handler(struct ifnet * ifp,dlil_input_func fn)3697 dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
3698 {
3699 return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
3700 ptrauth_nop_cast(void *, &dlil_input_handler),
3701 ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
3702 }
3703
/*
 * Restore the default input handler, retrying the compare-and-swap
 * until it succeeds (the expected value is re-read each iteration,
 * so a concurrent change simply causes another attempt).
 */
void
dlil_reset_input_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
	    ptrauth_nop_cast(void *, ifp->if_input_dlil),
	    ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
		;
	}
}
3713
/*
 * Atomically install a custom output handler on the interface.
 * Succeeds only if the current handler is still the default
 * dlil_output_handler; returns EBUSY otherwise.  Mirrors
 * dlil_set_input_handler().
 */
errno_t
dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	           ptrauth_nop_cast(void *, &dlil_output_handler),
	           ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
3721
/*
 * Restore the default output handler, retrying the compare-and-swap
 * until it succeeds.  Mirrors dlil_reset_input_handler().
 */
void
dlil_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
	    ptrauth_nop_cast(void *, ifp->if_output_dlil),
	    ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
		;
	}
}
3731 #endif /* SKYWALK */
3732
3733 errno_t
dlil_output_handler(struct ifnet * ifp,struct mbuf * m)3734 dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
3735 {
3736 return ifp->if_output(ifp, m);
3737 }
3738
/*
 * Default DLIL input handler: route the inbound chain to the
 * interface's input thread strategy, falling back to the main input
 * thread when the interface has no dedicated one.  On DEVELOPMENT/
 * DEBUG kernels, threads marked NET_THREAD_SYNC_RX bypass the async
 * path and process the chain synchronously.
 */
errno_t
dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp)
{
	struct dlil_threading_info *inp = ifp->if_inp;

	if (__improbable(inp == NULL)) {
		inp = dlil_main_input_thread;
	}

#if (DEVELOPMENT || DEBUG)
	if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
		return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
	} else
#endif /* (DEVELOPMENT || DEBUG) */
	{
		return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
	}
}
3759
3760 /*
3761 * Detect whether a queue contains a burst that needs to be trimmed.
3762 */
/*
 * True when an mbuf-backed queue has grown past the larger of its own
 * limit and the global if_rcvq_burst_limit sysctl — i.e. it holds a
 * burst that should be trimmed.
 */
#define MBUF_QUEUE_IS_OVERCOMMITTED(q) \
    __improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) && \
    qtype(q) == QP_MBUF)

/* NOTE(review): presumably an upper bound on mbuf class indices used
 * for per-class accounting below — confirm against the mbuf class enum. */
#define MAX_KNOWN_MBUF_CLASS  8
3768
/*
 * Trim an overcommitted mbuf input queue down to a target length of
 * (qlimit * if_rcvq_trim_pct / 100) packets, dropping from the head
 * (oldest packets first).
 *
 * Dropped mbufs are NOT freed here: they are moved onto `freeq' so the
 * caller can release them after dropping dlth_lock.  `stat_delta' is
 * adjusted in place so the caller can account the drops.
 *
 * Caller must hold the lock protecting `input_queue'.
 * Returns the number of packets dropped.
 */
static uint32_t
dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
    dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
{
	uint32_t overcommitted_qlen;    /* Length in packets. */
	uint64_t overcommitted_qsize;   /* Size in bytes. */
	uint32_t target_qlen;           /* The desired queue length after trimming. */
	uint32_t pkts_to_drop = 0;      /* Number of packets to drop. */
	uint32_t dropped_pkts = 0;      /* Number of packets that were dropped. */
	uint32_t dropped_bytes = 0;     /* Number of dropped bytes. */
	struct mbuf *m = NULL, *m_tmp = NULL;

	overcommitted_qlen = qlen(input_queue);
	overcommitted_qsize = qsize(input_queue);
	target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;

	if (overcommitted_qlen <= target_qlen) {
		/*
		 * The queue is already within the target limits.
		 */
		dropped_pkts = 0;
		goto out;
	}

	pkts_to_drop = overcommitted_qlen - target_qlen;

	/*
	 * Proceed to removing packets from the head of the queue,
	 * starting from the oldest, until the desired number of packets
	 * has been dropped.
	 */
	MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
		if (pkts_to_drop <= dropped_pkts) {
			break;
		}
		MBUFQ_REMOVE(&qmbufq(input_queue), m);
		MBUFQ_NEXT(m) = NULL;
		MBUFQ_ENQUEUE(freeq, m);

		dropped_pkts += 1;
		dropped_bytes += m_length(m);
	}

	/*
	 * Adjust the length and the estimated size of the queue
	 * after trimming.
	 */
	VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
	qlen(input_queue) = target_qlen;

	/* qsize() is an approximation. */
	if (dropped_bytes < qsize(input_queue)) {
		qsize(input_queue) -= dropped_bytes;
	} else {
		qsize(input_queue) = 0;
	}

	/*
	 * Adjust the ifnet statistics increments, if needed.
	 * Saturate at zero rather than underflowing, since the dropped
	 * packets may not all belong to the increment being accounted.
	 */
	stat_delta->dropped += dropped_pkts;
	if (dropped_pkts < stat_delta->packets_in) {
		stat_delta->packets_in -= dropped_pkts;
	} else {
		stat_delta->packets_in = 0;
	}
	if (dropped_bytes < stat_delta->bytes_in) {
		stat_delta->bytes_in -= dropped_bytes;
	} else {
		stat_delta->bytes_in = 0;
	}

out:
	if (dlil_verbose) {
		/*
		 * The basic information about the drop is logged
		 * by the invoking function (dlil_input_{,a}sync).
		 * If `dlil_verbose' flag is set, provide more information
		 * that can be useful for debugging.
		 */
		DLIL_PRINTF("%s: "
		    "qlen: %u -> %u, "
		    "qsize: %llu -> %llu "
		    "qlimit: %u (sysctl: %u) "
		    "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
		    "dropped_pkts: %u dropped_bytes %u\n",
		    __func__,
		    overcommitted_qlen, qlen(input_queue),
		    overcommitted_qsize, qsize(input_queue),
		    qlimit(input_queue), if_rcvq_burst_limit,
		    target_qlen, if_rcvq_trim_pct, pkts_to_drop,
		    dropped_pkts, dropped_bytes);
	}

	return dropped_pkts;
}
3865
/*
 * Asynchronous input strategy: enqueue the chain [m_head, m_tail] onto
 * the input thread's queue (or the dedicated lo0 queue for loopback on
 * the main thread), trim the queue if overcommitted, update stats, and
 * wake the input thread.  The caller's thread does not process packets.
 */
static errno_t
dlil_input_async(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	/* Local copy of stats; adjusted if the queue gets trimmed below. */
	struct ifnet_stat_increment_param s_adj = *s;
	/* Holds trimmed mbufs; freed only after dlth_lock is released. */
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	/*
	 * If there is a matching DLIL input thread associated with an
	 * affinity set, associate this thread with the same set. We
	 * will only do this once.
	 */
	lck_mtx_lock_spin(&inp->dlth_lock);
	if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
	    ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
	    (poll && inp->dlth_poller_thread == THREAD_NULL))) {
		u_int32_t tag = inp->dlth_affinity_tag;

		if (poll) {
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_poller_thread = tp;
		} else {
			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
		}
		/* dlil_affinity_set may block; drop the spin lock around it */
		lck_mtx_unlock(&inp->dlth_lock);

		/* Associate the current thread with the new affinity tag */
		(void) dlil_affinity_set(tp, tag);

		/*
		 * Take a reference on the current thread; during detach,
		 * we will need to refer to it in order to tear down its
		 * affinity.
		 */
		thread_reference(tp);
		lck_mtx_lock_spin(&inp->dlth_lock);
	}

	VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));

	/*
	 * Because of loopbacked multicast we cannot stuff the ifp in
	 * the rcvif of the packet header: loopback (lo0) packets use a
	 * dedicated list so that we can later associate them with lo_ifp
	 * on their way up the stack. Packets for other interfaces without
	 * dedicated input threads go to the regular list.
	 */
	if (m_head != NULL) {
		classq_pkt_t head, tail;
		class_queue_t *input_queue;
		CLASSQ_PKT_INIT_MBUF(&head, m_head);
		CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
		if (inp == dlil_main_input_thread && ifp == lo_ifp) {
			struct dlil_main_threading_info *inpm =
			    (struct dlil_main_threading_info *)inp;
			input_queue = &inpm->lo_rcvq_pkts;
		} else {
			input_queue = &inp->dlth_pkts;
		}

		_addq_multi(input_queue, &head, &tail, m_cnt, m_size);

		/*
		 * Trim an over-long burst; excess mbufs land on freeq and
		 * s_adj is reduced accordingly.
		 */
		if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
			dlil_trim_overcomitted_queue_locked(input_queue, &freeq, &s_adj);
			inp->dlth_trim_pkts_dropped += s_adj.dropped;
			inp->dlth_trim_cnt += 1;

			os_log_error(OS_LOG_DEFAULT,
			    "%s %s burst limit %u (sysctl: %u) exceeded. "
			    "%u packets dropped [%u total in %u events]. new qlen %u ",
			    __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
			    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
			    qlen(input_queue));
		}
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter
	 * accurately describes the input chain `m_head`.
	 * This is not affected by the trimming of input queue.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);
	/*
	 * If we're using the main input thread, synchronize the
	 * stats now since we have the interface context. All
	 * other cases involving dedicated input threads will
	 * have their stats synchronized there.
	 */
	if (inp == dlil_main_input_thread) {
		notify = dlil_input_stats_sync(ifp, inp);
	}

	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	return 0;
}
4009
/*
 * Synchronous input strategy: enqueue the chain, then immediately drain
 * the queue and process all packets inline on the calling thread via
 * dlil_input_packet_list_extended().  Never used with the main input
 * thread (asserted below).
 */
static errno_t
dlil_input_sync(struct dlil_threading_info *inp,
    struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
    const struct ifnet_stat_increment_param *s, boolean_t poll,
    struct thread *tp)
{
#pragma unused(tp)
	u_int32_t m_cnt = s->packets_in;
	u_int32_t m_size = s->bytes_in;
	boolean_t notify = FALSE;
	classq_pkt_t head, tail;
	/* Local copy of stats; adjusted if the queue gets trimmed below. */
	struct ifnet_stat_increment_param s_adj = *s;
	/* Holds trimmed mbufs; freed only after dlth_lock is released. */
	dlil_freeq_t freeq;
	MBUFQ_INIT(&freeq);

	ASSERT(inp != dlil_main_input_thread);

	/* XXX: should we just assert instead? */
	if (__improbable(m_head == NULL)) {
		return 0;
	}

	CLASSQ_PKT_INIT_MBUF(&head, m_head);
	CLASSQ_PKT_INIT_MBUF(&tail, m_tail);

	lck_mtx_lock_spin(&inp->dlth_lock);
	_addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);

	/*
	 * Trim an over-long burst; excess mbufs land on freeq and s_adj
	 * is reduced accordingly.
	 */
	if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
		dlil_trim_overcomitted_queue_locked(&inp->dlth_pkts, &freeq, &s_adj);
		inp->dlth_trim_pkts_dropped += s_adj.dropped;
		inp->dlth_trim_cnt += 1;

		os_log_error(OS_LOG_DEFAULT,
		    "%s %s burst limit %u (sysctl: %u) exceeded. "
		    "%u packets dropped [%u total in %u events]. new qlen %u \n",
		    __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
		    s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
		    qlen(&inp->dlth_pkts));
	}

#if IFNET_INPUT_SANITY_CHK
	/*
	 * Verify that the original stat increment parameter accurately
	 * describes the input chain `m_head'.
	 */
	if (__improbable(dlil_input_sanity_check != 0)) {
		u_int32_t count = 0, size = 0;
		struct mbuf *m0;

		for (m0 = m_head; m0; m0 = mbuf_nextpkt(m0)) {
			size += m_length(m0);
			count++;
		}

		if (count != m_cnt) {
			panic_plain("%s: invalid total packet count %u "
			    "(expected %u)\n", if_name(ifp), count, m_cnt);
			/* NOTREACHED */
			__builtin_unreachable();
		} else if (size != m_size) {
			panic_plain("%s: invalid total packet size %u "
			    "(expected %u)\n", if_name(ifp), size, m_size);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		inp->dlth_pkts_cnt += m_cnt;
	}
#endif /* IFNET_INPUT_SANITY_CHK */

	/* NOTE: use the adjusted parameter, vs the original one */
	dlil_input_stats_add(&s_adj, inp, ifp, poll);

	/* Drain everything queued (including what we just added). */
	m_cnt = qlen(&inp->dlth_pkts);
	_getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);

#if SKYWALK
	/*
	 * If this interface is attached to a netif nexus,
	 * the stats are already incremented there; otherwise
	 * do it here.
	 */
	if (!(ifp->if_capabilities & IFCAP_SKYWALK))
#endif /* SKYWALK */
	notify = dlil_input_stats_sync(ifp, inp);

	lck_mtx_unlock(&inp->dlth_lock);

	/*
	 * Actual freeing of the excess packets must happen
	 * after the dlth_lock had been released.
	 */
	if (!MBUFQ_EMPTY(&freeq)) {
		m_freem_list(MBUFQ_FIRST(&freeq));
	}

	if (notify) {
		ifnet_notify_data_threshold(ifp);
	}

	/*
	 * NOTE warning %%% attention !!!!
	 * We should think about putting some thread starvation
	 * safeguards if we deal with long chains of packets.
	 */
	if (head.cp_mbuf != NULL) {
		dlil_input_packet_list_extended(ifp, head.cp_mbuf,
		    m_cnt, ifp->if_poll_mode);
	}

	return 0;
}
4119
4120 #if SKYWALK
/*
 * Atomically install `fn' as the interface output handler, but only if
 * the current handler is still the saved original (if_save_output).
 * Returns EBUSY if another handler is already installed.
 * ptrauth_nop_cast keeps pointer-authentication signatures intact on
 * arm64e while the compare-exchange operates on raw pointers.
 */
errno_t
ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_save_output),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4128
/*
 * Restore the saved original output handler (if_save_output), retrying
 * the compare-exchange until it succeeds.  The expected value is
 * re-read from if_output on each iteration, so the loop terminates once
 * the swap lands regardless of concurrent updates.
 */
void
ifnet_reset_output_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
	    ptrauth_nop_cast(void *, ifp->if_output),
	    ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
		;
	}
}
4138
/*
 * Atomically install `fn' as the interface start handler, but only if
 * the current handler is still the saved original (if_save_start).
 * Returns EBUSY if another handler is already installed.
 */
errno_t
ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
{
	return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_save_start),
	    ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
}
4146
/*
 * Restore the saved original start handler (if_save_start), retrying
 * the compare-exchange until it succeeds (see
 * ifnet_reset_output_handler for the retry rationale).
 */
void
ifnet_reset_start_handler(struct ifnet *ifp)
{
	while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
	    ptrauth_nop_cast(void *, ifp->if_start),
	    ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
		;
	}
}
4156 #endif /* SKYWALK */
4157
/*
 * Common implementation behind ifnet_start()/ifnet_start_ignore_delay():
 * request a transmit pass from the interface's starter thread.
 *
 * resetfc:      clear IFSF_FLOW_CONTROLLED before signaling.
 * ignore_delay: set IFSF_NO_DELAY so the starter skips delayed start.
 */
static void
ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
{
	/* Nothing to do for interfaces without a starter thread model. */
	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return;
	}
	/*
	 * If the starter thread is inactive, signal it to do work,
	 * unless the interface is being flow controlled from below,
	 * e.g. a virtual interface being flow controlled by a real
	 * network interface beneath it, or it's been disabled via
	 * a call to ifnet_disable_output().
	 */
	lck_mtx_lock_spin(&ifp->if_start_lock);
	if (ignore_delay) {
		ifp->if_start_flags |= IFSF_NO_DELAY;
	}
	if (resetfc) {
		ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
	} else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return;
	}
	/* Bump the request counter so the starter loop sees new work. */
	ifp->if_start_req++;
	if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
	    (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
	    ifp->if_start_delayed == 0)) {
		(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	}
	lck_mtx_unlock(&ifp->if_start_lock);
}
4190
/* Kick the starter thread; honor flow control and delayed start. */
void
ifnet_start(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, FALSE);
}
4196
/* Kick the starter thread, bypassing the delayed-start optimization. */
void
ifnet_start_ignore_delay(struct ifnet *ifp)
{
	ifnet_start_common(ifp, FALSE, TRUE);
}
4202
/*
 * Bootstrap entry for an interface's starter thread.  Names the thread,
 * optionally binds it to the main input thread's affinity set (lo0
 * only), then parks in ifnet_start_thread_cont which does all the work.
 * Never returns.
 */
__attribute__((noreturn))
static void
ifnet_start_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct ifnet *ifp = v;
	char thread_name[MAXTHREADNAMESIZE];

	/* Construct the name for this thread, and then apply it. */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_start_%s", ifp->if_xname);
#if SKYWALK
	/* override name for native Skywalk interface */
	if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
		(void) snprintf(thread_name, sizeof(thread_name),
		    "skywalk_doorbell_%s_tx", ifp->if_xname);
	}
#endif /* SKYWALK */
	ASSERT(ifp->if_start_thread == current_thread());
	thread_set_thread_name(current_thread(), thread_name);

	/*
	 * Treat the dedicated starter thread for lo0 as equivalent to
	 * the driver workloop thread; if net_affinity is enabled for
	 * the main input thread, associate this starter thread to it
	 * by binding them with the same affinity tag. This is done
	 * only once (as we only have one lo_ifp which never goes away.)
	 */
	if (ifp == lo_ifp) {
		struct dlil_threading_info *inp = dlil_main_input_thread;
		struct thread *tp = current_thread();
#if SKYWALK
		/* native skywalk loopback not yet implemented */
		VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */

		lck_mtx_lock(&inp->dlth_lock);
		if (inp->dlth_affinity) {
			u_int32_t tag = inp->dlth_affinity_tag;

			VERIFY(inp->dlth_driver_thread == THREAD_NULL);
			VERIFY(inp->dlth_poller_thread == THREAD_NULL);
			inp->dlth_driver_thread = tp;
			lck_mtx_unlock(&inp->dlth_lock);

			/* Associate this thread with the affinity tag */
			(void) dlil_affinity_set(tp, tag);
		} else {
			lck_mtx_unlock(&inp->dlth_lock);
		}
	}

	lck_mtx_lock(&ifp->if_start_lock);
	VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
	(void) assert_wait(&ifp->if_start_thread, THREAD_UNINT);
	ifp->if_start_embryonic = 1;
	/* wake up once to get out of embryonic state */
	ifp->if_start_req++;
	(void) wakeup_one((caddr_t)&ifp->if_start_thread);
	lck_mtx_unlock(&ifp->if_start_lock);
	/* Park; all subsequent wakeups resume in ifnet_start_thread_cont. */
	(void) thread_block_parameter(ifnet_start_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4268
4269 __attribute__((noreturn))
4270 static void
ifnet_start_thread_cont(void * v,wait_result_t wres)4271 ifnet_start_thread_cont(void *v, wait_result_t wres)
4272 {
4273 struct ifnet *ifp = v;
4274 struct ifclassq *ifq = ifp->if_snd;
4275
4276 lck_mtx_lock_spin(&ifp->if_start_lock);
4277 if (__improbable(wres == THREAD_INTERRUPTED ||
4278 (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
4279 goto terminate;
4280 }
4281
4282 if (__improbable(ifp->if_start_embryonic)) {
4283 ifp->if_start_embryonic = 0;
4284 lck_mtx_unlock(&ifp->if_start_lock);
4285 ifnet_decr_pending_thread_count(ifp);
4286 lck_mtx_lock_spin(&ifp->if_start_lock);
4287 goto skip;
4288 }
4289
4290 ifp->if_start_active = 1;
4291
4292 /*
4293 * Keep on servicing until no more request.
4294 */
4295 for (;;) {
4296 u_int32_t req = ifp->if_start_req;
4297 if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
4298 !IFCQ_IS_EMPTY(ifq) &&
4299 (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
4300 ifp->if_start_delayed == 0 &&
4301 IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
4302 (ifp->if_eflags & IFEF_DELAY_START)) {
4303 ifp->if_start_delayed = 1;
4304 ifnet_start_delayed++;
4305 break;
4306 }
4307 ifp->if_start_flags &= ~IFSF_NO_DELAY;
4308 ifp->if_start_delayed = 0;
4309 lck_mtx_unlock(&ifp->if_start_lock);
4310
4311 /*
4312 * If no longer attached, don't call start because ifp
4313 * is being destroyed; else hold an IO refcnt to
4314 * prevent the interface from being detached (will be
4315 * released below.)
4316 */
4317 if (!ifnet_datamov_begin(ifp)) {
4318 lck_mtx_lock_spin(&ifp->if_start_lock);
4319 break;
4320 }
4321
4322 /* invoke the driver's start routine */
4323 ((*ifp->if_start)(ifp));
4324
4325 /*
4326 * Release the io ref count taken above.
4327 */
4328 ifnet_datamov_end(ifp);
4329
4330 lck_mtx_lock_spin(&ifp->if_start_lock);
4331
4332 /*
4333 * If there's no pending request or if the
4334 * interface has been disabled, we're done.
4335 */
4336 #define _IFSF_DISABLED (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
4337 if (req == ifp->if_start_req ||
4338 (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
4339 break;
4340 }
4341 }
4342 skip:
4343 ifp->if_start_req = 0;
4344 ifp->if_start_active = 0;
4345
4346 #if SKYWALK
4347 /*
4348 * Wakeup any waiters, e.g. any threads waiting to
4349 * detach the interface from the flowswitch, etc.
4350 */
4351 if (ifp->if_start_waiters != 0) {
4352 ifp->if_start_waiters = 0;
4353 wakeup(&ifp->if_start_waiters);
4354 }
4355 #endif /* SKYWALK */
4356 if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
4357 uint64_t deadline = TIMEOUT_WAIT_FOREVER;
4358 struct timespec delay_start_ts;
4359 struct timespec *ts = NULL;
4360
4361 if (ts == NULL) {
4362 ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
4363 &ifp->if_start_cycle : NULL);
4364 }
4365
4366 if (ts == NULL && ifp->if_start_delayed == 1) {
4367 delay_start_ts.tv_sec = 0;
4368 delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
4369 ts = &delay_start_ts;
4370 }
4371
4372 if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
4373 ts = NULL;
4374 }
4375
4376 if (__improbable(ts != NULL)) {
4377 clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
4378 (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
4379 }
4380
4381 (void) assert_wait_deadline(&ifp->if_start_thread,
4382 THREAD_UNINT, deadline);
4383 lck_mtx_unlock(&ifp->if_start_lock);
4384 (void) thread_block_parameter(ifnet_start_thread_cont, ifp);
4385 /* NOTREACHED */
4386 } else {
4387 terminate:
4388 /* interface is detached? */
4389 ifnet_set_start_cycle(ifp, NULL);
4390
4391 /* clear if_start_thread to allow termination to continue */
4392 ASSERT(ifp->if_start_thread != THREAD_NULL);
4393 ifp->if_start_thread = THREAD_NULL;
4394 wakeup((caddr_t)&ifp->if_start_thread);
4395 lck_mtx_unlock(&ifp->if_start_lock);
4396
4397 if (dlil_verbose) {
4398 DLIL_PRINTF("%s: starter thread terminated\n",
4399 if_name(ifp));
4400 }
4401
4402 /* for the extra refcnt from kernel_thread_start() */
4403 thread_deallocate(current_thread());
4404 /* this is the end */
4405 thread_terminate(current_thread());
4406 /* NOTREACHED */
4407 }
4408
4409 /* must never get here */
4410 VERIFY(0);
4411 /* NOTREACHED */
4412 __builtin_unreachable();
4413 }
4414
4415 void
ifnet_set_start_cycle(struct ifnet * ifp,struct timespec * ts)4416 ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4417 {
4418 if (ts == NULL) {
4419 bzero(&ifp->if_start_cycle, sizeof(ifp->if_start_cycle));
4420 } else {
4421 *(&ifp->if_start_cycle) = *ts;
4422 }
4423
4424 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4425 DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4426 if_name(ifp), ts->tv_nsec);
4427 }
4428 }
4429
/*
 * Record a poll request and wake the poller thread if it is idle.
 * Caller must hold if_poll_lock (asserted).
 */
static inline void
ifnet_poll_wakeup(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);

	ifp->if_poll_req++;
	/* Only wake when the poller exists and is not already running. */
	if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
	    ifp->if_poll_thread != THREAD_NULL) {
		wakeup_one((caddr_t)&ifp->if_poll_thread);
	}
}
4441
/* Request an RX poll pass from the interface's poller thread. */
void
ifnet_poll(struct ifnet *ifp)
{
	/*
	 * If the poller thread is inactive, signal it to do work.
	 */
	lck_mtx_lock_spin(&ifp->if_poll_lock);
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
}
4452
/*
 * Bootstrap entry for an interface's RX poller thread.  Names the
 * thread, enters the embryonic state, and parks in
 * ifnet_poll_thread_cont which does all the work.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_poll_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct ifnet *ifp = v;

	VERIFY(ifp->if_eflags & IFEF_RXPOLL);
	VERIFY(current_thread() == ifp->if_poll_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "ifnet_poller_%s", ifp->if_xname);
	thread_set_thread_name(ifp->if_poll_thread, thread_name);

	lck_mtx_lock(&ifp->if_poll_lock);
	VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
	(void) assert_wait(&ifp->if_poll_thread, THREAD_UNINT);
	ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	ifnet_poll_wakeup(ifp);
	lck_mtx_unlock(&ifp->if_poll_lock);
	/* Park; all subsequent wakeups resume in ifnet_poll_thread_cont. */
	(void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
	/* NOTREACHED */
	__builtin_unreachable();
}
4481
4482 __attribute__((noreturn))
4483 static void
ifnet_poll_thread_cont(void * v,wait_result_t wres)4484 ifnet_poll_thread_cont(void *v, wait_result_t wres)
4485 {
4486 struct dlil_threading_info *inp;
4487 struct ifnet *ifp = v;
4488 struct ifnet_stat_increment_param s;
4489 struct timespec start_time;
4490
4491 VERIFY(ifp->if_eflags & IFEF_RXPOLL);
4492
4493 bzero(&s, sizeof(s));
4494 net_timerclear(&start_time);
4495
4496 lck_mtx_lock_spin(&ifp->if_poll_lock);
4497 if (__improbable(wres == THREAD_INTERRUPTED ||
4498 (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
4499 goto terminate;
4500 }
4501
4502 inp = ifp->if_inp;
4503 VERIFY(inp != NULL);
4504
4505 if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
4506 ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
4507 lck_mtx_unlock(&ifp->if_poll_lock);
4508 ifnet_decr_pending_thread_count(ifp);
4509 lck_mtx_lock_spin(&ifp->if_poll_lock);
4510 goto skip;
4511 }
4512
4513 ifp->if_poll_flags |= IF_POLLF_RUNNING;
4514
4515 /*
4516 * Keep on servicing until no more request.
4517 */
4518 for (;;) {
4519 struct mbuf *m_head, *m_tail;
4520 u_int32_t m_lim, m_cnt, m_totlen;
4521 u_int16_t req = ifp->if_poll_req;
4522
4523 m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
4524 MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
4525 lck_mtx_unlock(&ifp->if_poll_lock);
4526
4527 /*
4528 * If no longer attached, there's nothing to do;
4529 * else hold an IO refcnt to prevent the interface
4530 * from being detached (will be released below.)
4531 */
4532 if (!ifnet_is_attached(ifp, 1)) {
4533 lck_mtx_lock_spin(&ifp->if_poll_lock);
4534 break;
4535 }
4536
4537 if (dlil_verbose > 1) {
4538 DLIL_PRINTF("%s: polling up to %d pkts, "
4539 "pkts avg %d max %d, wreq avg %d, "
4540 "bytes avg %d\n",
4541 if_name(ifp), m_lim,
4542 ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
4543 ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
4544 }
4545
4546 /* invoke the driver's input poll routine */
4547 ((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
4548 &m_cnt, &m_totlen));
4549
4550 if (m_head != NULL) {
4551 VERIFY(m_tail != NULL && m_cnt > 0);
4552
4553 if (dlil_verbose > 1) {
4554 DLIL_PRINTF("%s: polled %d pkts, "
4555 "pkts avg %d max %d, wreq avg %d, "
4556 "bytes avg %d\n",
4557 if_name(ifp), m_cnt,
4558 ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
4559 ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
4560 }
4561
4562 /* stats are required for extended variant */
4563 s.packets_in = m_cnt;
4564 s.bytes_in = m_totlen;
4565
4566 (void) ifnet_input_common(ifp, m_head, m_tail,
4567 &s, TRUE, TRUE);
4568 } else {
4569 if (dlil_verbose > 1) {
4570 DLIL_PRINTF("%s: no packets, "
4571 "pkts avg %d max %d, wreq avg %d, "
4572 "bytes avg %d\n",
4573 if_name(ifp), ifp->if_rxpoll_pavg,
4574 ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
4575 ifp->if_rxpoll_bavg);
4576 }
4577
4578 (void) ifnet_input_common(ifp, NULL, NULL,
4579 NULL, FALSE, TRUE);
4580 }
4581
4582 /* Release the io ref count */
4583 ifnet_decr_iorefcnt(ifp);
4584
4585 lck_mtx_lock_spin(&ifp->if_poll_lock);
4586
4587 /* if there's no pending request, we're done */
4588 if (req == ifp->if_poll_req ||
4589 (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
4590 break;
4591 }
4592 }
4593 skip:
4594 ifp->if_poll_req = 0;
4595 ifp->if_poll_flags &= ~IF_POLLF_RUNNING;
4596
4597 if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
4598 uint64_t deadline = TIMEOUT_WAIT_FOREVER;
4599 struct timespec *ts;
4600
4601 /*
4602 * Wakeup N ns from now, else sleep indefinitely (ts = NULL)
4603 * until ifnet_poll() is called again.
4604 */
4605 ts = &ifp->if_poll_cycle;
4606 if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
4607 ts = NULL;
4608 }
4609
4610 if (ts != NULL) {
4611 clock_interval_to_deadline((uint32_t)(ts->tv_nsec +
4612 (ts->tv_sec * NSEC_PER_SEC)), 1, &deadline);
4613 }
4614
4615 (void) assert_wait_deadline(&ifp->if_poll_thread,
4616 THREAD_UNINT, deadline);
4617 lck_mtx_unlock(&ifp->if_poll_lock);
4618 (void) thread_block_parameter(ifnet_poll_thread_cont, ifp);
4619 /* NOTREACHED */
4620 } else {
4621 terminate:
4622 /* interface is detached (maybe while asleep)? */
4623 ifnet_set_poll_cycle(ifp, NULL);
4624
4625 /* clear if_poll_thread to allow termination to continue */
4626 ASSERT(ifp->if_poll_thread != THREAD_NULL);
4627 ifp->if_poll_thread = THREAD_NULL;
4628 wakeup((caddr_t)&ifp->if_poll_thread);
4629 lck_mtx_unlock(&ifp->if_poll_lock);
4630
4631 if (dlil_verbose) {
4632 DLIL_PRINTF("%s: poller thread terminated\n",
4633 if_name(ifp));
4634 }
4635
4636 /* for the extra refcnt from kernel_thread_start() */
4637 thread_deallocate(current_thread());
4638 /* this is the end */
4639 thread_terminate(current_thread());
4640 /* NOTREACHED */
4641 }
4642
4643 /* must never get here */
4644 VERIFY(0);
4645 /* NOTREACHED */
4646 __builtin_unreachable();
4647 }
4648
4649 void
ifnet_set_poll_cycle(struct ifnet * ifp,struct timespec * ts)4650 ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
4651 {
4652 if (ts == NULL) {
4653 bzero(&ifp->if_poll_cycle, sizeof(ifp->if_poll_cycle));
4654 } else {
4655 *(&ifp->if_poll_cycle) = *ts;
4656 }
4657
4658 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4659 DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
4660 if_name(ifp), ts->tv_nsec);
4661 }
4662 }
4663
/*
 * Flush the interface's send queue.  Only meaningful for interfaces
 * using the starter thread model (IFEF_TXSTART); NULL ifp tolerated.
 */
void
ifnet_purge(struct ifnet *ifp)
{
	if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
		if_qflush_snd(ifp, false);
	}
}
4671
/*
 * Propagate a classq event `ev' to the send queue.  If a token bucket
 * regulator is active, re-apply its profile first so the regulator is
 * recomputed for the (possibly changed) link parameters.
 * Caller must hold the ifclassq lock (asserted).
 */
void
ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
{
	IFCQ_LOCK_ASSERT_HELD(ifq);

	if (!(IFCQ_IS_READY(ifq))) {
		return;
	}

	if (IFCQ_TBR_IS_ENABLED(ifq)) {
		/* Re-set the current TBR profile to force recomputation. */
		struct tb_profile tb = {
			.rate = ifq->ifcq_tbr.tbr_rate_raw,
			.percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
		};
		(void) ifclassq_tbr_set(ifq, &tb, FALSE);
	}

	ifclassq_update(ifq, ev);
}
4691
4692 void
ifnet_update_rcv(struct ifnet * ifp,cqev_t ev)4693 ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
4694 {
4695 switch (ev) {
4696 case CLASSQ_EV_LINK_BANDWIDTH:
4697 if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
4698 ifp->if_poll_update++;
4699 }
4700 break;
4701
4702 default:
4703 break;
4704 }
4705 }
4706
4707 errno_t
ifnet_set_output_sched_model(struct ifnet * ifp,u_int32_t model)4708 ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
4709 {
4710 struct ifclassq *ifq;
4711 u_int32_t omodel;
4712 errno_t err;
4713
4714 if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
4715 return EINVAL;
4716 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4717 return ENXIO;
4718 }
4719
4720 ifq = ifp->if_snd;
4721 IFCQ_LOCK(ifq);
4722 omodel = ifp->if_output_sched_model;
4723 ifp->if_output_sched_model = model;
4724 if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
4725 ifp->if_output_sched_model = omodel;
4726 }
4727 IFCQ_UNLOCK(ifq);
4728
4729 return err;
4730 }
4731
4732 errno_t
ifnet_set_sndq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4733 ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4734 {
4735 if (ifp == NULL) {
4736 return EINVAL;
4737 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4738 return ENXIO;
4739 }
4740
4741 ifclassq_set_maxlen(ifp->if_snd, maxqlen);
4742
4743 return 0;
4744 }
4745
4746 errno_t
ifnet_get_sndq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4747 ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4748 {
4749 if (ifp == NULL || maxqlen == NULL) {
4750 return EINVAL;
4751 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4752 return ENXIO;
4753 }
4754
4755 *maxqlen = ifclassq_get_maxlen(ifp->if_snd);
4756
4757 return 0;
4758 }
4759
4760 errno_t
ifnet_get_sndq_len(struct ifnet * ifp,u_int32_t * pkts)4761 ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
4762 {
4763 errno_t err;
4764
4765 if (ifp == NULL || pkts == NULL) {
4766 err = EINVAL;
4767 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4768 err = ENXIO;
4769 } else {
4770 err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
4771 IF_CLASSQ_ALL_GRPS, pkts, NULL);
4772 }
4773
4774 return err;
4775 }
4776
4777 errno_t
ifnet_get_service_class_sndq_len(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t * pkts,u_int32_t * bytes)4778 ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
4779 u_int32_t *pkts, u_int32_t *bytes)
4780 {
4781 errno_t err;
4782
4783 if (ifp == NULL || !MBUF_VALID_SC(sc) ||
4784 (pkts == NULL && bytes == NULL)) {
4785 err = EINVAL;
4786 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
4787 err = ENXIO;
4788 } else {
4789 err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
4790 pkts, bytes);
4791 }
4792
4793 return err;
4794 }
4795
4796 errno_t
ifnet_set_rcvq_maxlen(struct ifnet * ifp,u_int32_t maxqlen)4797 ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
4798 {
4799 struct dlil_threading_info *inp;
4800
4801 if (ifp == NULL) {
4802 return EINVAL;
4803 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4804 return ENXIO;
4805 }
4806
4807 if (maxqlen == 0) {
4808 maxqlen = if_rcvq_maxlen;
4809 } else if (maxqlen < IF_RCVQ_MINLEN) {
4810 maxqlen = IF_RCVQ_MINLEN;
4811 }
4812
4813 inp = ifp->if_inp;
4814 lck_mtx_lock(&inp->dlth_lock);
4815 qlimit(&inp->dlth_pkts) = maxqlen;
4816 lck_mtx_unlock(&inp->dlth_lock);
4817
4818 return 0;
4819 }
4820
4821 errno_t
ifnet_get_rcvq_maxlen(struct ifnet * ifp,u_int32_t * maxqlen)4822 ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
4823 {
4824 struct dlil_threading_info *inp;
4825
4826 if (ifp == NULL || maxqlen == NULL) {
4827 return EINVAL;
4828 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
4829 return ENXIO;
4830 }
4831
4832 inp = ifp->if_inp;
4833 lck_mtx_lock(&inp->dlth_lock);
4834 *maxqlen = qlimit(&inp->dlth_pkts);
4835 lck_mtx_unlock(&inp->dlth_lock);
4836 return 0;
4837 }
4838
4839 void
ifnet_enqueue_multi_setup(struct ifnet * ifp,uint16_t delay_qlen,uint16_t delay_timeout)4840 ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
4841 uint16_t delay_timeout)
4842 {
4843 if (delay_qlen > 0 && delay_timeout > 0) {
4844 if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
4845 ifp->if_start_delay_qlen = MIN(100, delay_qlen);
4846 ifp->if_start_delay_timeout = min(20000, delay_timeout);
4847 /* convert timeout to nanoseconds */
4848 ifp->if_start_delay_timeout *= 1000;
4849 kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
4850 ifp->if_xname, (uint32_t)delay_qlen,
4851 (uint32_t)delay_timeout);
4852 } else {
4853 if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
4854 }
4855 }
4856
4857 /*
4858 * This function clears the DSCP bits in the IPV4/V6 header pointed to by buf.
4859 * While it's ok for buf to be not 32 bit aligned, the caller must ensure that
4860 * buf holds the full header.
4861 */
static __attribute__((noinline)) void
ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
{
	struct ip *ip;
	struct ip6_hdr *ip6;
	/* aligned scratch buffer, large enough for either IP header */
	uint8_t lbuf[64] __attribute__((aligned(8)));
	uint8_t *p = buf;

	if (ip_ver == IPVERSION) {
		uint8_t old_tos;
		uint32_t sum;

		/* operate on an aligned local copy if buf is misaligned */
		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
			p = lbuf;
		}
		ip = (struct ip *)(void *)p;
		/* fast path: no DSCP bits set (ECN bits are preserved) */
		if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v4, struct ip *, ip);
		old_tos = ip->ip_tos;
		ip->ip_tos &= IPTOS_ECN_MASK;
		/*
		 * Incremental header-checksum update (RFC 1624 style):
		 * add the delta between old and new TOS, then fold the
		 * carry back into the low 16 bits.
		 */
		sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
		sum = (sum >> 16) + (sum & 0xffff);
		ip->ip_sum = (uint16_t)(sum & 0xffff);

		/* write the modified copy back if we worked on lbuf */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
		}
	} else {
		uint32_t flow;
		ASSERT(ip_ver == IPV6_VERSION);

		/* operate on an aligned local copy if buf is misaligned */
		if (__improbable(!IP_HDR_ALIGNED_P(p))) {
			DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
			p = lbuf;
		}
		ip6 = (struct ip6_hdr *)(void *)p;
		flow = ntohl(ip6->ip6_flow);
		/* fast path: no DSCP bits set in the traffic class */
		if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
			return;
		}

		DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
		/* no checksum to fix up: IPv6 has no header checksum */
		ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);

		/* write the modified copy back if we worked on lbuf */
		if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
		}
	}
}
4917
/*
 * Enqueue a single packet (mbuf or Skywalk packet) onto the interface's
 * output classq (ifcq when supplied, otherwise ifp->if_snd).  On the way
 * in, the packet is stamped with an enqueue timestamp, foreground/realtime
 * activity timestamps are refreshed, the Wi-Fi multicast DSCP-clearing
 * workaround is applied, and the delayed-start (enqueue coalescing)
 * heuristics are updated before the driver is kicked.  The packet is
 * consumed in all cases; *pdrop reports whether the classq dropped it.
 */
static inline errno_t
ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
    classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
{
#if SKYWALK
	volatile struct sk_nexusadv *nxadv = NULL;
#endif /* SKYWALK */
	volatile uint64_t *fg_ts = NULL;
	volatile uint64_t *rt_ts = NULL;
	struct timespec now;
	u_int64_t now_nsec = 0;
	int error = 0;
	/* non-NULL iff the DSCP-clearing workaround applies to this packet */
	uint8_t *mcast_buf = NULL;
	uint8_t ip_ver;
	uint32_t pktlen;

	ASSERT(ifp->if_eflags & IFEF_TXSTART);
#if SKYWALK
	/*
	 * If attached to flowswitch, grab pointers to the
	 * timestamp variables in the nexus advisory region.
	 */
	if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
	    (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
		fg_ts = &nxadv->nxadv_fg_sendts;
		rt_ts = &nxadv->nxadv_rt_sendts;
	}
#endif /* SKYWALK */

	/*
	 * If packet already carries a timestamp, either from dlil_output()
	 * or from flowswitch, use it here. Otherwise, record timestamp.
	 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
	 * the timestamp value is used internally there.
	 */
	switch (p->cp_ptype) {
	case QP_MBUF:
#if SKYWALK
		/*
		 * Valid only for non-native (compat) Skywalk interface.
		 * If the data source uses packet, caller must convert
		 * it to mbuf first prior to calling this routine.
		 */
		ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
#endif /* SKYWALK */
		ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
		ASSERT(p->cp_mbuf->m_nextpkt == NULL);

		if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
		    p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
		}
		p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamp to indicate recent activity
		 * on a foreground socket.
		 */
		if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
		    p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
			if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
			    PKTF_SO_BACKGROUND)) {
				ifp->if_fg_sendts = (uint32_t)_net_uptime;
				if (fg_ts != NULL) {
					*fg_ts = (uint32_t)_net_uptime;
				}
			}
			if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
				ifp->if_rt_sendts = (uint32_t)_net_uptime;
				if (rt_ts != NULL) {
					*rt_ts = (uint32_t)_net_uptime;
				}
			}
		}
		pktlen = m_pktlen(p->cp_mbuf);

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
			struct ether_header *eh;
			boolean_t pullup = FALSE;
			uint16_t etype;

			/* ensure at least the Ethernet header is contiguous */
			if (__improbable(len < sizeof(struct ether_header))) {
				DTRACE_IP1(small__ether, size_t, len);
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
				    sizeof(struct ether_header))) == NULL) {
					return ENOMEM;
				}
			}
			eh = mtod(p->cp_mbuf, struct ether_header *);
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip);
				if (len < hlen) {
					DTRACE_IP1(small__v4, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				hlen = sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr);
				if (len < hlen) {
					DTRACE_IP1(small__v6, size_t, len);
					pullup = TRUE;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP: workaround does not apply */
				DTRACE_IP1(invalid__etype, uint16_t, etype);
				break;
			}
			if (pullup) {
				if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
				    NULL) {
					return ENOMEM;
				}

				/* m_pullup may have moved the data */
				eh = mtod(p->cp_mbuf, struct ether_header *);
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * Note that the pullups above ensure that mcast_buf
			 * points to a full IP header.
			 */
		}
		break;

#if SKYWALK
	case QP_PACKET:
		/*
		 * Valid only for native Skywalk interface. If the data
		 * source uses mbuf, caller must convert it to packet first
		 * prior to calling this routine.
		 */
		ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
		    p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
			p->cp_kpkt->pkt_timestamp = now_nsec;
		}
		p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
		/*
		 * If the packet service class is not background,
		 * update the timestamps on the interface, as well as
		 * the ones in nexus-wide advisory to indicate recent
		 * activity on a foreground flow.
		 */
		if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
			ifp->if_fg_sendts = (uint32_t)_net_uptime;
			if (fg_ts != NULL) {
				*fg_ts = (uint32_t)_net_uptime;
			}
		}
		if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
			ifp->if_rt_sendts = (uint32_t)_net_uptime;
			if (rt_ts != NULL) {
				*rt_ts = (uint32_t)_net_uptime;
			}
		}
		pktlen = p->cp_kpkt->pkt_length;

		/*
		 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (radr://9331522).
		 * As a workaround we clear the DSCP bits but keep service
		 * class (rdar://51507725).
		 */
		if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
		    IFNET_IS_WIFI_INFRA(ifp)) {
			uint8_t *baddr;
			struct ether_header *eh;
			uint16_t etype;

			MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
			baddr += p->cp_kpkt->pkt_headroom;
			/* too short for even an Ethernet header: skip */
			if (__improbable(pktlen < sizeof(struct ether_header))) {
				DTRACE_IP1(pkt__small__ether, __kern_packet *,
				    p->cp_kpkt);
				break;
			}
			eh = (struct ether_header *)(void *)baddr;
			etype = ntohs(eh->ether_type);
			if (etype == ETHERTYPE_IP) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip)) {
					DTRACE_IP1(pkt__small__v4, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPVERSION;
			} else if (etype == ETHERTYPE_IPV6) {
				if (pktlen < sizeof(struct ether_header) +
				    sizeof(struct ip6_hdr)) {
					DTRACE_IP1(pkt__small__v6, uint32_t,
					    pktlen);
					break;
				}
				ip_ver = IPV6_VERSION;
			} else {
				/* not IP: workaround does not apply */
				DTRACE_IP1(pkt__invalid__etype, uint16_t,
				    etype);
				break;
			}
			mcast_buf = (uint8_t *)(eh + 1);
			/*
			 * ifnet_mcast_clear_dscp() will finish the work below.
			 * The checks above verify that the IP header is in the
			 * first buflet.
			 */
		}
		break;
#endif /* SKYWALK */

	default:
		/* unknown classq packet type: cannot happen */
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
	}

	if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
		if (now_nsec == 0) {
			nanouptime(&now);
			net_timernsec(&now, &now_nsec);
		}
		/*
		 * If the driver chose to delay start callback for
		 * coalescing multiple packets, then use the following
		 * heuristics to make sure that start callback will
		 * be delayed only when bulk data transfer is detected.
		 * 1. number of packets enqueued in (delay_win * 2) is
		 * greater than or equal to the delay qlen.
		 * 2. If delay_start is enabled it will stay enabled for
		 * another 10 idle windows. This is to take into account
		 * variable RTT and burst traffic.
		 * 3. If the time elapsed since last enqueue is more
		 * than 200ms we disable delaying start callback. This is
		 * to take idle time into account.
		 */
		u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
		if (ifp->if_start_delay_swin > 0) {
			if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
				/* still inside the current window */
				ifp->if_start_delay_cnt++;
			} else if ((now_nsec - ifp->if_start_delay_swin)
			    >= (200 * 1000 * 1000)) {
				/* idle >= 200ms: reset and stop delaying */
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
				ifp->if_start_delay_idle = 0;
				if (ifp->if_eflags & IFEF_DELAY_START) {
					if_clear_eflags(ifp, IFEF_DELAY_START);
					ifnet_delay_start_disabled_increment();
				}
			} else {
				/* window expired: evaluate the heuristics */
				if (ifp->if_start_delay_cnt >=
				    ifp->if_start_delay_qlen) {
					if_set_eflags(ifp, IFEF_DELAY_START);
					ifp->if_start_delay_idle = 0;
				} else {
					if (ifp->if_start_delay_idle >= 10) {
						if_clear_eflags(ifp,
						    IFEF_DELAY_START);
						ifnet_delay_start_disabled_increment();
					} else {
						ifp->if_start_delay_idle++;
					}
				}
				ifp->if_start_delay_swin = now_nsec;
				ifp->if_start_delay_cnt = 1;
			}
		} else {
			/* first enqueue: open a new window */
			ifp->if_start_delay_swin = now_nsec;
			ifp->if_start_delay_cnt = 1;
			ifp->if_start_delay_idle = 0;
			if_clear_eflags(ifp, IFEF_DELAY_START);
		}
	} else {
		if_clear_eflags(ifp, IFEF_DELAY_START);
	}

	/* enqueue the packet (caller consumes object) */
	error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
	    1, pktlen, pdrop);

	/*
	 * Tell the driver to start dequeueing; do this even when the queue
	 * for the packet is suspended (EQSUSPENDED), as the driver could still
	 * be dequeueing from other unsuspended queues.
	 */
	if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
	    ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
		ifnet_start(ifp);
	}

	return error;
}
5227
5228 static inline errno_t
ifnet_enqueue_ifclassq_chain(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * head,classq_pkt_t * tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5229 ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5230 classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5231 boolean_t flush, boolean_t *pdrop)
5232 {
5233 int error;
5234
5235 /* enqueue the packet (caller consumes object) */
5236 error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5237 cnt, bytes, pdrop);
5238
5239 /*
5240 * Tell the driver to start dequeueing; do this even when the queue
5241 * for the packet is suspended (EQSUSPENDED), as the driver could still
5242 * be dequeueing from other unsuspended queues.
5243 */
5244 if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5245 ifnet_start(ifp);
5246 }
5247 return error;
5248 }
5249
5250 int
ifnet_enqueue_netem(void * handle,pktsched_pkt_t * pkts,uint32_t n_pkts)5251 ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5252 {
5253 struct ifnet *ifp = handle;
5254 boolean_t pdrop; /* dummy */
5255 uint32_t i;
5256
5257 ASSERT(n_pkts >= 1);
5258 for (i = 0; i < n_pkts - 1; i++) {
5259 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5260 FALSE, &pdrop);
5261 }
5262 /* flush with the last packet */
5263 (void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
5264 TRUE, &pdrop);
5265
5266 return 0;
5267 }
5268
5269 static inline errno_t
ifnet_enqueue_common(struct ifnet * ifp,struct ifclassq * ifcq,classq_pkt_t * pkt,boolean_t flush,boolean_t * pdrop)5270 ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5271 classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5272 {
5273 if (ifp->if_output_netem != NULL) {
5274 bool drop;
5275 errno_t error;
5276 error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
5277 *pdrop = drop ? TRUE : FALSE;
5278 return error;
5279 } else {
5280 return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5281 }
5282 }
5283
5284 errno_t
ifnet_enqueue(struct ifnet * ifp,struct mbuf * m)5285 ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5286 {
5287 uint32_t bytes = m_pktlen(m);
5288 struct mbuf *tail = m;
5289 uint32_t cnt = 1;
5290 boolean_t pdrop;
5291
5292 while (tail->m_nextpkt) {
5293 VERIFY(tail->m_flags & M_PKTHDR);
5294 tail = tail->m_nextpkt;
5295 cnt++;
5296 bytes += m_pktlen(tail);
5297 }
5298
5299 return ifnet_enqueue_mbuf_chain(ifp, m, tail, cnt, bytes, TRUE, &pdrop);
5300 }
5301
5302 errno_t
ifnet_enqueue_mbuf(struct ifnet * ifp,struct mbuf * m,boolean_t flush,boolean_t * pdrop)5303 ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5304 boolean_t *pdrop)
5305 {
5306 classq_pkt_t pkt;
5307
5308 if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5309 m->m_nextpkt != NULL) {
5310 if (m != NULL) {
5311 m_freem_list(m);
5312 *pdrop = TRUE;
5313 }
5314 return EINVAL;
5315 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5316 !IF_FULLY_ATTACHED(ifp)) {
5317 /* flag tested without lock for performance */
5318 m_freem(m);
5319 *pdrop = TRUE;
5320 return ENXIO;
5321 } else if (!(ifp->if_flags & IFF_UP)) {
5322 m_freem(m);
5323 *pdrop = TRUE;
5324 return ENETDOWN;
5325 }
5326
5327 CLASSQ_PKT_INIT_MBUF(&pkt, m);
5328 return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5329 }
5330
5331 errno_t
ifnet_enqueue_mbuf_chain(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5332 ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5333 struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5334 boolean_t *pdrop)
5335 {
5336 classq_pkt_t head, tail;
5337
5338 ASSERT(m_head != NULL);
5339 ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5340 ASSERT(m_tail != NULL);
5341 ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5342 ASSERT(ifp != NULL);
5343 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5344
5345 if (!IF_FULLY_ATTACHED(ifp)) {
5346 /* flag tested without lock for performance */
5347 m_freem_list(m_head);
5348 *pdrop = TRUE;
5349 return ENXIO;
5350 } else if (!(ifp->if_flags & IFF_UP)) {
5351 m_freem_list(m_head);
5352 *pdrop = TRUE;
5353 return ENETDOWN;
5354 }
5355
5356 CLASSQ_PKT_INIT_MBUF(&head, m_head);
5357 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
5358 return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
5359 flush, pdrop);
5360 }
5361
5362 #if SKYWALK
5363 static errno_t
ifnet_enqueue_pkt_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5364 ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5365 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5366 {
5367 classq_pkt_t pkt;
5368
5369 ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5370
5371 if (__improbable(ifp == NULL || kpkt == NULL)) {
5372 if (kpkt != NULL) {
5373 pp_free_packet(__DECONST(struct kern_pbufpool *,
5374 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5375 *pdrop = TRUE;
5376 }
5377 return EINVAL;
5378 } else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5379 !IF_FULLY_ATTACHED(ifp))) {
5380 /* flag tested without lock for performance */
5381 pp_free_packet(__DECONST(struct kern_pbufpool *,
5382 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5383 *pdrop = TRUE;
5384 return ENXIO;
5385 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5386 pp_free_packet(__DECONST(struct kern_pbufpool *,
5387 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5388 *pdrop = TRUE;
5389 return ENETDOWN;
5390 }
5391
5392 CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
5393 return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5394 }
5395
5396 errno_t
ifnet_enqueue_pkt(struct ifnet * ifp,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5397 ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5398 boolean_t flush, boolean_t *pdrop)
5399 {
5400 return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5401 }
5402
5403 errno_t
ifnet_enqueue_ifcq_pkt(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * kpkt,boolean_t flush,boolean_t * pdrop)5404 ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5405 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5406 {
5407 return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5408 }
5409
5410 static errno_t
ifnet_enqueue_pkt_chain_common(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5411 ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5412 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5413 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5414 {
5415 classq_pkt_t head, tail;
5416
5417 ASSERT(k_head != NULL);
5418 ASSERT(k_tail != NULL);
5419 ASSERT(ifp != NULL);
5420 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5421
5422 if (!IF_FULLY_ATTACHED(ifp)) {
5423 /* flag tested without lock for performance */
5424 pp_free_packet_chain(k_head, NULL);
5425 *pdrop = TRUE;
5426 return ENXIO;
5427 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5428 pp_free_packet_chain(k_head, NULL);
5429 *pdrop = TRUE;
5430 return ENETDOWN;
5431 }
5432
5433 CLASSQ_PKT_INIT_PACKET(&head, k_head);
5434 CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
5435 return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
5436 flush, pdrop);
5437 }
5438
5439 errno_t
ifnet_enqueue_pkt_chain(struct ifnet * ifp,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5440 ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5441 struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5442 boolean_t *pdrop)
5443 {
5444 return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5445 cnt, bytes, flush, pdrop);
5446 }
5447
5448 errno_t
ifnet_enqueue_ifcq_pkt_chain(struct ifnet * ifp,struct ifclassq * ifcq,struct __kern_packet * k_head,struct __kern_packet * k_tail,uint32_t cnt,uint32_t bytes,boolean_t flush,boolean_t * pdrop)5449 ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5450 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5451 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5452 {
5453 return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5454 cnt, bytes, flush, pdrop);
5455 }
5456 #endif /* SKYWALK */
5457
5458 errno_t
ifnet_dequeue(struct ifnet * ifp,struct mbuf ** mp)5459 ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5460 {
5461 errno_t rc;
5462 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5463
5464 if (ifp == NULL || mp == NULL) {
5465 return EINVAL;
5466 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5467 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5468 return ENXIO;
5469 }
5470 if (!ifnet_is_attached(ifp, 1)) {
5471 return ENXIO;
5472 }
5473
5474 #if SKYWALK
5475 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5476 #endif /* SKYWALK */
5477 rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5478 &pkt, NULL, NULL, NULL, 0);
5479 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5480 ifnet_decr_iorefcnt(ifp);
5481 *mp = pkt.cp_mbuf;
5482 return rc;
5483 }
5484
5485 errno_t
ifnet_dequeue_service_class(struct ifnet * ifp,mbuf_svc_class_t sc,struct mbuf ** mp)5486 ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5487 struct mbuf **mp)
5488 {
5489 errno_t rc;
5490 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5491
5492 if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5493 return EINVAL;
5494 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5495 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5496 return ENXIO;
5497 }
5498 if (!ifnet_is_attached(ifp, 1)) {
5499 return ENXIO;
5500 }
5501
5502 #if SKYWALK
5503 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5504 #endif /* SKYWALK */
5505 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5506 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5507 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5508 ifnet_decr_iorefcnt(ifp);
5509 *mp = pkt.cp_mbuf;
5510 return rc;
5511 }
5512
5513 errno_t
ifnet_dequeue_multi(struct ifnet * ifp,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5514 ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5515 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5516 {
5517 errno_t rc;
5518 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5519 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5520
5521 if (ifp == NULL || head == NULL || pkt_limit < 1) {
5522 return EINVAL;
5523 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5524 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5525 return ENXIO;
5526 }
5527 if (!ifnet_is_attached(ifp, 1)) {
5528 return ENXIO;
5529 }
5530
5531 #if SKYWALK
5532 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5533 #endif /* SKYWALK */
5534 rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5535 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5536 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5537 ifnet_decr_iorefcnt(ifp);
5538 *head = pkt_head.cp_mbuf;
5539 if (tail != NULL) {
5540 *tail = pkt_tail.cp_mbuf;
5541 }
5542 return rc;
5543 }
5544
5545 errno_t
ifnet_dequeue_multi_bytes(struct ifnet * ifp,u_int32_t byte_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5546 ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5547 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5548 {
5549 errno_t rc;
5550 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5551 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5552
5553 if (ifp == NULL || head == NULL || byte_limit < 1) {
5554 return EINVAL;
5555 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5556 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5557 return ENXIO;
5558 }
5559 if (!ifnet_is_attached(ifp, 1)) {
5560 return ENXIO;
5561 }
5562
5563 #if SKYWALK
5564 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5565 #endif /* SKYWALK */
5566 rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5567 byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5568 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5569 ifnet_decr_iorefcnt(ifp);
5570 *head = pkt_head.cp_mbuf;
5571 if (tail != NULL) {
5572 *tail = pkt_tail.cp_mbuf;
5573 }
5574 return rc;
5575 }
5576
5577 errno_t
ifnet_dequeue_service_class_multi(struct ifnet * ifp,mbuf_svc_class_t sc,u_int32_t pkt_limit,struct mbuf ** head,struct mbuf ** tail,u_int32_t * cnt,u_int32_t * len)5578 ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5579 u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5580 u_int32_t *len)
5581 {
5582 errno_t rc;
5583 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5584 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5585
5586 if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5587 !MBUF_VALID_SC(sc)) {
5588 return EINVAL;
5589 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5590 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5591 return ENXIO;
5592 }
5593 if (!ifnet_is_attached(ifp, 1)) {
5594 return ENXIO;
5595 }
5596
5597 #if SKYWALK
5598 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5599 #endif /* SKYWALK */
5600 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
5601 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
5602 cnt, len, 0);
5603 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5604 ifnet_decr_iorefcnt(ifp);
5605 *head = pkt_head.cp_mbuf;
5606 if (tail != NULL) {
5607 *tail = pkt_tail.cp_mbuf;
5608 }
5609 return rc;
5610 }
5611
5612 #if XNU_TARGET_OS_OSX
5613 errno_t
ifnet_framer_stub(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * dest,const char * dest_linkaddr,const char * frame_type,u_int32_t * pre,u_int32_t * post)5614 ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
5615 const struct sockaddr *dest, const char *dest_linkaddr,
5616 const char *frame_type, u_int32_t *pre, u_int32_t *post)
5617 {
5618 if (pre != NULL) {
5619 *pre = 0;
5620 }
5621 if (post != NULL) {
5622 *post = 0;
5623 }
5624
5625 return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
5626 }
5627 #endif /* XNU_TARGET_OS_OSX */
5628
5629 static boolean_t
packet_has_vlan_tag(struct mbuf * m)5630 packet_has_vlan_tag(struct mbuf * m)
5631 {
5632 u_int tag = 0;
5633
5634 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
5635 tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
5636 if (tag == 0) {
5637 /* the packet is just priority-tagged, clear the bit */
5638 m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
5639 }
5640 }
5641 return tag != 0;
5642 }
5643
/*
 * Run an inbound packet through the interface filter chain.  Returns 0
 * when the packet should continue up the stack; a nonzero result means a
 * filter intercepted the packet and the caller must not touch *m_p
 * further.  A filter may replace *m_p and *frame_header_p in place.
 * The filter lock is dropped around each filt_input callback; the
 * monitor-busy marker keeps the list stable across that window.
 */
static int
dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
    char **frame_header_p, protocol_family_t protocol_family,
    boolean_t skip_bridge)
{
	boolean_t is_vlan_packet = FALSE;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	/* note: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}

	/*
	 * Pass the inbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}
		/* the bridge has already seen the packet */
		if (skip_bridge &&
		    (filter->filt_flags & DLIL_IFF_BRIDGE) != 0) {
			continue;
		}
		if (!filter->filt_skip && filter->filt_input != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock: the callback may block */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/*
	 * Strip away M_PROTO1 bit prior to sending packet up the stack as
	 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
	 */
	if (*m_p != NULL) {
		(*m_p)->m_flags &= ~M_PROTO1;
	}

	return 0;
}
5709
/*
 * Run an outbound packet through the interface filter chain.  Returns 0
 * when the packet should continue toward the driver; a nonzero result
 * means a filter intercepted the packet and the caller must not touch
 * *m_p further.  A filter may replace *m_p in place.  The filter lock is
 * dropped around each filt_output callback; the monitor-busy marker
 * keeps the list stable across that window.
 */
__attribute__((noinline))
static int
dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
    protocol_family_t protocol_family)
{
	boolean_t is_vlan_packet;
	struct ifnet_filter *filter;
	struct mbuf *m = *m_p;

	if (TAILQ_EMPTY(&ifp->if_flt_head)) {
		return 0;
	}
	/* note: may clear CSUM_VLAN_TAG_VALID for priority-only tags */
	is_vlan_packet = packet_has_vlan_tag(m);

	/*
	 * Pass the outbound packet to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		int result;

		/* exclude VLAN packets from external filters PR-3586856 */
		if (is_vlan_packet &&
		    (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
			continue;
		}

		if (!filter->filt_skip && filter->filt_output != NULL &&
		    (filter->filt_protocol == 0 ||
		    filter->filt_protocol == protocol_family)) {
			/* drop the lock: the callback may block */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
			if (result != 0) {
				/* we're done with the filter list */
				if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				return result;
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	return 0;
}
5762
/*
 * Deliver an inbound packet chain to the protocol attached to the
 * interface.  v1 protocols receive one packet (plus its frame header)
 * per callback; v2 protocols receive the entire m_nextpkt-linked list in
 * one call.  Packets are freed here on any error other than EJUSTRETURN,
 * which signals that the callee retained the mbuf(s).
 *
 * NOTE(review): if proto_kpi is neither v1 nor v2 the chain is neither
 * delivered nor freed — presumably callers only attach v1/v2 protocols;
 * confirm before relying on this path.
 */
static void
dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
{
	int error;

	if (ifproto->proto_kpi == kProtoKPI_v1) {
		/* Version 1 protocols get one packet at a time */
		while (m != NULL) {
			char * frame_header;
			mbuf_t next_packet;

			/* detach this packet from the chain */
			next_packet = m->m_nextpkt;
			m->m_nextpkt = NULL;
			/* hand the saved frame header over, then clear it */
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
			error = (*ifproto->kpi.v1.input)(ifproto->ifp,
			    ifproto->protocol_family, m, frame_header);
			if (error != 0 && error != EJUSTRETURN) {
				m_freem(m);
			}
			m = next_packet;
		}
	} else if (ifproto->proto_kpi == kProtoKPI_v2) {
		/* Version 2 protocols support packet lists */
		error = (*ifproto->kpi.v2.input)(ifproto->ifp,
		    ifproto->protocol_family, m);
		if (error != 0 && error != EJUSTRETURN) {
			m_freem_list(m);
		}
	}
}
5794
5795 static void
dlil_input_stats_add(const struct ifnet_stat_increment_param * s,struct dlil_threading_info * inp,struct ifnet * ifp,boolean_t poll)5796 dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
5797 struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
5798 {
5799 struct ifnet_stat_increment_param *d = &inp->dlth_stats;
5800
5801 if (s->packets_in != 0) {
5802 d->packets_in += s->packets_in;
5803 }
5804 if (s->bytes_in != 0) {
5805 d->bytes_in += s->bytes_in;
5806 }
5807 if (s->errors_in != 0) {
5808 d->errors_in += s->errors_in;
5809 }
5810
5811 if (s->packets_out != 0) {
5812 d->packets_out += s->packets_out;
5813 }
5814 if (s->bytes_out != 0) {
5815 d->bytes_out += s->bytes_out;
5816 }
5817 if (s->errors_out != 0) {
5818 d->errors_out += s->errors_out;
5819 }
5820
5821 if (s->collisions != 0) {
5822 d->collisions += s->collisions;
5823 }
5824 if (s->dropped != 0) {
5825 d->dropped += s->dropped;
5826 }
5827
5828 if (poll) {
5829 PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
5830 }
5831 }
5832
5833 static boolean_t
dlil_input_stats_sync(struct ifnet * ifp,struct dlil_threading_info * inp)5834 dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
5835 {
5836 struct ifnet_stat_increment_param *s = &inp->dlth_stats;
5837
5838 /*
5839 * Use of atomic operations is unavoidable here because
5840 * these stats may also be incremented elsewhere via KPIs.
5841 */
5842 if (s->packets_in != 0) {
5843 os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
5844 s->packets_in = 0;
5845 }
5846 if (s->bytes_in != 0) {
5847 os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
5848 s->bytes_in = 0;
5849 }
5850 if (s->errors_in != 0) {
5851 os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
5852 s->errors_in = 0;
5853 }
5854
5855 if (s->packets_out != 0) {
5856 os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
5857 s->packets_out = 0;
5858 }
5859 if (s->bytes_out != 0) {
5860 os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
5861 s->bytes_out = 0;
5862 }
5863 if (s->errors_out != 0) {
5864 os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
5865 s->errors_out = 0;
5866 }
5867
5868 if (s->collisions != 0) {
5869 os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
5870 s->collisions = 0;
5871 }
5872 if (s->dropped != 0) {
5873 os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
5874 s->dropped = 0;
5875 }
5876
5877 /*
5878 * No need for atomic operations as they are modified here
5879 * only from within the DLIL input thread context.
5880 */
5881 if (ifp->if_poll_tstats.packets != 0) {
5882 ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
5883 ifp->if_poll_tstats.packets = 0;
5884 }
5885 if (ifp->if_poll_tstats.bytes != 0) {
5886 ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
5887 ifp->if_poll_tstats.bytes = 0;
5888 }
5889
5890 return ifp->if_data_threshold != 0;
5891 }
5892
5893 __private_extern__ void
dlil_input_packet_list(struct ifnet * ifp,struct mbuf * m)5894 dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
5895 {
5896 return dlil_input_packet_list_common(ifp, m, 0,
5897 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
5898 }
5899
5900 __private_extern__ void
dlil_input_packet_list_extended(struct ifnet * ifp,struct mbuf * m,u_int32_t cnt,ifnet_model_t mode)5901 dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
5902 u_int32_t cnt, ifnet_model_t mode)
5903 {
5904 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
5905 }
5906
/*
 * Give the bridge an early look at an inbound packet chain, before the
 * normal interface-filter pass.
 *
 * The filter list is marked busy (under if_flt_lock) for the duration
 * of the bridge call so that the list cannot be torn down while we are
 * outside the lock; bridge_early_input() itself is invoked without
 * if_flt_lock held.  Returns the (possibly replaced or consumed)
 * packet chain from the bridge, or the original chain when no bridge
 * is attached.
 */
static inline mbuf_t
handle_bridge_early_input(ifnet_t ifp, mbuf_t m, u_int32_t cnt)
{
	/* pin the filter list: busy under the lock, then drop the lock */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	if_flt_monitor_busy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* re-check under no lock; if_bridge may have been cleared */
	if (ifp->if_bridge != NULL) {
		m = bridge_early_input(ifp, m, cnt);
	}
	/* release the busy hold taken above */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);
	return m;
}
5922
/*
 * Core DLIL inbound path: walk a chain of received packets, demux each
 * one to a protocol family via the interface's if_demux routine, run
 * CLAT46 translation and interface filters, then batch consecutive
 * packets destined for the same protocol attachment and hand each
 * batch to dlil_ifproto_input().
 *
 * ifp_param may be NULL, in which case each packet's rcvif is used
 * (packets from different interfaces may then be interleaved in the
 * chain).  `cnt`/`mode`/`ext` drive the opportunistic RX-poll logic
 * for legacy polling interfaces.
 *
 * An ifnet_datamov_begin() reference is held across packets that share
 * the same non-loopback interface and released when the interface
 * changes or the chain ends.
 */
static void
dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
    u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
{
	int error = 0;
	protocol_family_t protocol_family;
	mbuf_t next_packet;
	ifnet_t ifp = ifp_param;
	char *frame_header = NULL;
	struct if_proto *last_ifproto = NULL;
	mbuf_t pkt_first = NULL;        /* head of current same-proto batch */
	mbuf_t *pkt_next = NULL;        /* tail link of current batch */
	u_int32_t poll_thresh = 0, poll_ival = 0;
	int iorefcnt = 0;               /* 1 while holding a datamov ref */
	boolean_t skip_bridge_filter = FALSE;

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* arm opportunistic polling only for extended, multi-packet input */
	if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
	    (poll_ival = if_rxpoll_interval_pkts) > 0) {
		poll_thresh = cnt;
	}
	/* let the bridge see the whole chain up front, once */
	if (bridge_enable_early_input != 0 &&
	    ifp != NULL && ifp->if_bridge != NULL) {
		m = handle_bridge_early_input(ifp, m, cnt);
		skip_bridge_filter = TRUE;
	}
	while (m != NULL) {
		struct if_proto *ifproto = NULL;
		uint32_t pktf_mask;     /* pkt flags to preserve */

		m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);

		if (ifp_param == NULL) {
			ifp = m->m_pkthdr.rcvif;
		}

		/* kick the poller every poll_ival packets (legacy RXPOLL) */
		if ((ifp->if_eflags & IFEF_RXPOLL) &&
		    (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
		    poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
			ifnet_poll(ifp);
		}

		/* Check if this mbuf looks valid */
		MBUF_INPUT_CHECK(m, ifp);

		/* detach this packet from the chain; stash its frame header */
		next_packet = m->m_nextpkt;
		m->m_nextpkt = NULL;
		frame_header = m->m_pkthdr.pkt_hdr;
		m->m_pkthdr.pkt_hdr = NULL;

		/*
		 * Get an IO reference count if the interface is not
		 * loopback (lo0) and it is attached; lo0 never goes
		 * away, so optimize for that.
		 */
		if (ifp != lo_ifp) {
			/* iorefcnt is 0 if it hasn't been taken yet */
			if (iorefcnt == 0) {
				if (!ifnet_datamov_begin(ifp)) {
					m_freem(m);
					goto next;
				}
			}
			iorefcnt = 1;
			/*
			 * Preserve the time stamp and skip pktap flags.
			 */
			pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
		} else {
			/*
			 * If this arrived on lo0, preserve interface addr
			 * info to allow for connectivity between loopback
			 * and local interface addresses.
			 */
			pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
		}
		pktf_mask |= PKTF_WAKE_PKT;

		/* make sure packet comes in clean */
		m_classifier_init(m, pktf_mask);

		ifp_inc_traffic_class_in(ifp, m);

		/* find which protocol family this packet is for */
		ifnet_lock_shared(ifp);
		error = (*ifp->if_demux)(ifp, m, frame_header,
		    &protocol_family);
		ifnet_lock_done(ifp);
		if (error != 0) {
			/* EJUSTRETURN: demux consumed the packet */
			if (error == EJUSTRETURN) {
				goto next;
			}
			protocol_family = 0;
		}
		/* check for an updated frame header */
		if (m->m_pkthdr.pkt_hdr != NULL) {
			frame_header = m->m_pkthdr.pkt_hdr;
			m->m_pkthdr.pkt_hdr = NULL;
		}

#if (DEVELOPMENT || DEBUG)
		/*
		 * For testing we do not care about broadcast and multicast packets as
		 * they are not as controllable as unicast traffic
		 */
		if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
			if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
			    (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
				/*
				 * This is a one-shot command
				 */
				ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
				m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
			}
		}
#endif /* (DEVELOPMENT || DEBUG) */
		/* optional debug logging of the start of a wake packet */
		if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
			char buffer[64];
			size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));

			os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
			    ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
			}
		}

		pktap_input(ifp, protocol_family, m, frame_header);

		/* Drop v4 packets received on CLAT46 enabled cell interface */
		if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
		    ifp->if_type == IFT_CELLULAR) {
			m_freem(m);
			ip6stat.ip6s_clat464_in_v4_drop++;
			goto next;
		}

		/* Translate the packet if it is received on CLAT interface */
		if ((m->m_flags & M_PROMISC) == 0 &&
		    protocol_family == PF_INET6 &&
		    IS_INTF_CLAT46(ifp) &&
		    dlil_is_clat_needed(protocol_family, m)) {
			char *data = NULL;
			struct ether_header eh;
			struct ether_header *ehp = NULL;

			if (ifp->if_type == IFT_ETHER) {
				ehp = (struct ether_header *)(void *)frame_header;
				/* Skip RX Ethernet packets if they are not IPV6 */
				if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
					goto skip_clat;
				}

				/* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			/* 6-to-4 translation may replace the mbuf chain */
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = mtod(m, char*);
			if (error != 0) {
				m_freem(m);
				ip6stat.ip6s_clat464_in_drop++;
				goto next;
			}
			/* Native v6 should be No-op */
			if (protocol_family != PF_INET) {
				goto skip_clat;
			}

			/* Do this only for translated v4 packets. */
			switch (ifp->if_type) {
			case IFT_CELLULAR:
				frame_header = data;
				break;
			case IFT_ETHER:
				/*
				 * Drop if the mbuf doesn't have enough
				 * space for Ethernet header
				 */
				if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
					m_free(m);
					ip6stat.ip6s_clat464_in_drop++;
					goto next;
				}
				/*
				 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceding the data pointer. Change
				 * the ether_type too.
				 */
				frame_header = data - ETHER_HDR_LEN;
				eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
				break;
			}
		}
skip_clat:
		/*
		 * Match the wake packet against the list of ports that has
		 * been queried by the driver before the device went to sleep
		 * (IP families are matched later in their own input paths).
		 */
		if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
			if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
			}
		}
		if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
		    !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
			dlil_input_cksum_dbg(ifp, m, frame_header,
			    protocol_family);
		}
		/*
		 * For partial checksum offload, we expect the driver to
		 * set the start offset indicating the start of the span
		 * that is covered by the hardware-computed checksum;
		 * adjust this start offset accordingly because the data
		 * pointer has been advanced beyond the link-layer header.
		 *
		 * Virtual lan types (bridge, vlan, bond) can call
		 * dlil_input_packet_list() with the same packet with the
		 * checksum flags set. Set a flag indicating that the
		 * adjustment has already been done.
		 */
		if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
			/* adjustment has already been done */
		} else if ((m->m_pkthdr.csum_flags &
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
		    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
			int adj;
			/* sanity-check frame_header before computing the span */
			if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
			    frame_header > (char *)m->m_data ||
			    (adj = (int)(m->m_data - (uintptr_t)frame_header)) >
			    m->m_pkthdr.csum_rx_start) {
				m->m_pkthdr.csum_data = 0;
				m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
				hwcksum_in_invalidated++;
			} else {
				m->m_pkthdr.csum_rx_start -= adj;
			}
			/* make sure we don't adjust more than once */
			m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
		}
		if (clat_debug) {
			pktap_input(ifp, protocol_family, m, frame_header);
		}

		if (m->m_flags & (M_BCAST | M_MCAST)) {
			os_atomic_inc(&ifp->if_imcasts, relaxed);
		}

		/* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family, skip_bridge_filter);
		if (error != 0) {
			if (error != EJUSTRETURN) {
				m_freem(m);
			}
			goto next;
		}
		/*
		 * A VLAN and Bond interface receives packets by attaching
		 * a "protocol" to the underlying interface.
		 * A promiscuous packet needs to be delivered to the
		 * VLAN or Bond interface since:
		 * - Bond interface member may not support setting the
		 *   MAC address, so packets are inherently "promiscuous"
		 * - A VLAN or Bond interface could be members of a bridge,
		 *   where promiscuous packets correspond to other
		 *   devices that the bridge forwards packets to/from
		 */
		if ((m->m_flags & M_PROMISC) != 0) {
			switch (protocol_family) {
			case PF_VLAN:
			case PF_BOND:
				/* VLAN and Bond get promiscuous packets */
				break;
			default:
				m_freem(m);
				goto next;
			}
		}

		/* Lookup the protocol attachment to this interface */
		if (protocol_family == 0) {
			ifproto = NULL;
		} else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
		    (last_ifproto->protocol_family == protocol_family)) {
			/* fast path: same proto as the previous packet */
			VERIFY(ifproto == NULL);
			ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
		} else {
			VERIFY(ifproto == NULL);
			ifnet_lock_shared(ifp);
			/* callee holds a proto refcnt upon success */
			ifproto = find_attached_proto(ifp, protocol_family);
			ifnet_lock_done(ifp);
		}
		if (ifproto == NULL) {
			/* no protocol for this packet, discard */
			m_freem(m);
			goto next;
		}
		if (ifproto != last_ifproto) {
			if (last_ifproto != NULL) {
				/* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
			}
			last_ifproto = ifproto;
			if_proto_ref(ifproto);
		}
		/* extend the list */
		m->m_pkthdr.pkt_hdr = frame_header;
		if (pkt_first == NULL) {
			pkt_first = m;
		} else {
			*pkt_next = m;
		}
		pkt_next = &m->m_nextpkt;

next:
		if (next_packet == NULL && last_ifproto != NULL) {
			/* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
			last_ifproto = NULL;
		}
		if (ifproto != NULL) {
			if_proto_free(ifproto);
			ifproto = NULL;
		}

		m = next_packet;

		/* update the driver's multicast filter, if needed */
		if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
			ifp->if_updatemcasts = 0;
		}
		if (iorefcnt == 1) {
			/* If the next mbuf is on a different interface, unlock data-mov */
			if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
				ifnet_datamov_end(ifp);
				iorefcnt = 0;
			}
		}
	}

	KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
6273
6274 static errno_t
if_mcasts_update_common(struct ifnet * ifp,bool sync)6275 if_mcasts_update_common(struct ifnet * ifp, bool sync)
6276 {
6277 errno_t err;
6278
6279 if (sync) {
6280 err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6281 if (err == EAFNOSUPPORT) {
6282 err = 0;
6283 }
6284 } else {
6285 ifnet_ioctl_async(ifp, SIOCADDMULTI);
6286 err = 0;
6287 }
6288 DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6289 "(err=%d)\n", if_name(ifp),
6290 (err == 0 ? "successfully restored" : "failed to restore"),
6291 ifp->if_updatemcasts, err);
6292
6293 /* just return success */
6294 return 0;
6295 }
6296
6297 static errno_t
if_mcasts_update_async(struct ifnet * ifp)6298 if_mcasts_update_async(struct ifnet *ifp)
6299 {
6300 return if_mcasts_update_common(ifp, false);
6301 }
6302
6303 errno_t
if_mcasts_update(struct ifnet * ifp)6304 if_mcasts_update(struct ifnet *ifp)
6305 {
6306 return if_mcasts_update_common(ifp, true);
6307 }
6308
/*
 * Post a kernel event message, first bumping the interface generation
 * count when an ifp is supplied (so observers can detect the change)
 * and nudging NECP clients when NECP is compiled in.
 *
 * Returns the result of kev_post_msg().
 */
int
dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
{
	if (ifp) {
		ifnet_increment_generation(ifp);
	}

#if NECP
	/* let NECP clients re-evaluate policy against the new state */
	necp_update_all_clients();
#endif /* NECP */

	return kev_post_msg(event);
}
6323
6324 __private_extern__ void
dlil_post_sifflags_msg(struct ifnet * ifp)6325 dlil_post_sifflags_msg(struct ifnet * ifp)
6326 {
6327 struct kev_msg ev_msg;
6328 struct net_event_data ev_data;
6329
6330 bzero(&ev_data, sizeof(ev_data));
6331 bzero(&ev_msg, sizeof(ev_msg));
6332 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6333 ev_msg.kev_class = KEV_NETWORK_CLASS;
6334 ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6335 ev_msg.event_code = KEV_DL_SIFFLAGS;
6336 strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6337 ev_data.if_family = ifp->if_family;
6338 ev_data.if_unit = (u_int32_t) ifp->if_unit;
6339 ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6340 ev_msg.dv[0].data_ptr = &ev_data;
6341 ev_msg.dv[1].data_length = 0;
6342 dlil_post_complete_msg(ifp, &ev_msg);
6343 }
6344
6345 #define TMP_IF_PROTO_ARR_SIZE 10
/*
 * Distribute a kernel event to everything attached to an interface:
 * first the interface filters, then every attached protocol's event
 * callback, then the interface's own if_event hook, and finally post
 * the event system-wide via dlil_post_complete_msg() (optionally
 * bumping the interface generation when update_generation is true).
 *
 * Protocol references are collected into a flat array (stack array for
 * up to TMP_IF_PROTO_ARR_SIZE protocols, heap otherwise) so callbacks
 * can be invoked without holding the ifnet lock.
 */
static int
dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
{
	struct ifnet_filter *filter = NULL;
	struct if_proto *proto = NULL;
	int if_proto_count = 0;
	struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
	struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
	int tmp_ifproto_arr_idx = 0;

	/*
	 * Pass the event to the interface filters
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_event != NULL) {
			/* call the filter outside the lock */
			lck_mtx_unlock(&ifp->if_flt_lock);

			filter->filt_event(filter->filt_cookie, ifp,
			    filter->filt_protocol, event);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		goto done;
	}

	/*
	 * An embedded tmp_list_entry in if_proto may still get
	 * over-written by another thread after giving up ifnet lock,
	 * therefore we are avoiding embedded pointers here.
	 */
	ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
	if (if_proto_count) {
		int i;
		VERIFY(ifp->if_proto_hash != NULL);
		if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
			/* small attachment count: use the stack array */
			tmp_ifproto_arr = tmp_ifproto_stack_arr;
		} else {
			tmp_ifproto_arr = kalloc_type(struct if_proto *,
			    if_proto_count, Z_WAITOK | Z_ZERO);
			if (tmp_ifproto_arr == NULL) {
				ifnet_lock_done(ifp);
				goto cleanup;
			}
		}

		/* snapshot every proto, taking a reference on each */
		for (i = 0; i < PROTO_HASH_SLOTS; i++) {
			SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
			    next_hash) {
				if_proto_ref(proto);
				tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
				tmp_ifproto_arr_idx++;
			}
		}
		VERIFY(if_proto_count == tmp_ifproto_arr_idx);
	}
	ifnet_lock_done(ifp);

	/* deliver the event to each proto, dropping its ref afterwards */
	for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
	    tmp_ifproto_arr_idx++) {
		proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
		VERIFY(proto != NULL);
		proto_media_event eventp =
		    (proto->proto_kpi == kProtoKPI_v1 ?
		    proto->kpi.v1.event :
		    proto->kpi.v2.event);

		if (eventp != NULL) {
			eventp(ifp, proto->protocol_family,
			    event);
		}
		if_proto_free(proto);
	}

cleanup:
	/* free the heap array if one was allocated */
	if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
		kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
	}

	/* Pass the event to the interface */
	if (ifp->if_event != NULL) {
		ifp->if_event(ifp, event);
	}

	/* Release the io ref count */
	ifnet_decr_iorefcnt(ifp);
done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
}
6445
6446 errno_t
ifnet_event(ifnet_t ifp,struct kern_event_msg * event)6447 ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6448 {
6449 struct kev_msg kev_msg;
6450 int result = 0;
6451
6452 if (ifp == NULL || event == NULL) {
6453 return EINVAL;
6454 }
6455
6456 bzero(&kev_msg, sizeof(kev_msg));
6457 kev_msg.vendor_code = event->vendor_code;
6458 kev_msg.kev_class = event->kev_class;
6459 kev_msg.kev_subclass = event->kev_subclass;
6460 kev_msg.event_code = event->event_code;
6461 kev_msg.dv[0].data_ptr = &event->event_data[0];
6462 kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6463 kev_msg.dv[1].data_length = 0;
6464
6465 result = dlil_event_internal(ifp, &kev_msg, TRUE);
6466
6467 return result;
6468 }
6469
6470 static void
dlil_count_chain_len(mbuf_t m,struct chain_len_stats * cls)6471 dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6472 {
6473 mbuf_t n = m;
6474 int chainlen = 0;
6475
6476 while (n != NULL) {
6477 chainlen++;
6478 n = n->m_next;
6479 }
6480 switch (chainlen) {
6481 case 0:
6482 break;
6483 case 1:
6484 os_atomic_inc(&cls->cls_one, relaxed);
6485 break;
6486 case 2:
6487 os_atomic_inc(&cls->cls_two, relaxed);
6488 break;
6489 case 3:
6490 os_atomic_inc(&cls->cls_three, relaxed);
6491 break;
6492 case 4:
6493 os_atomic_inc(&cls->cls_four, relaxed);
6494 break;
6495 case 5:
6496 default:
6497 os_atomic_inc(&cls->cls_five_or_more, relaxed);
6498 break;
6499 }
6500 }
6501
#if CONFIG_DTRACE
/*
 * Fire the DTrace ip:::send probe for an outbound IPv4 or IPv6 packet.
 * Kept out-of-line (noinline) so the probe sites stay cheap when
 * DTrace is not tracing.  Packets of other protocol families are
 * ignored.  The DTRACE_IP6 argument order (mbuf, inpcb, shared header,
 * ifnet, v4 header, v6 header) is fixed by the probe definition.
 */
__attribute__((noinline))
static void
dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
{
	if (proto_family == PF_INET) {
		struct ip *ip = mtod(m, struct ip *);
		/* v4: pass the ip header in both shared and v4 slots */
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip *, ip, struct ifnet *, ifp,
		    struct ip *, ip, struct ip6_hdr *, NULL);
	} else if (proto_family == PF_INET6) {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		/* v6: pass the ip6 header in both shared and v6 slots */
		DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
		    struct ip6_hdr *, ip6, struct ifnet *, ifp,
		    struct ip *, NULL, struct ip6_hdr *, ip6);
	}
}
#endif /* CONFIG_DTRACE */
6520
6521 /*
6522 * dlil_output
6523 *
6524 * Caller should have a lock on the protocol domain if the protocol
6525 * doesn't support finer grained locking. In most cases, the lock
6526 * will be held from the socket layer and won't be released until
6527 * we return back to the socket layer.
6528 *
6529 * This does mean that we must take a protocol lock before we take
6530 * an interface lock if we're going to take both. This makes sense
6531 * because a protocol is likely to interact with an ifp while it
6532 * is under the protocol lock.
6533 *
6534 * An advisory code will be returned if adv is not null. This
6535 * can be used to provide feedback about interface queues to the
6536 * application.
6537 */
6538 errno_t
dlil_output(ifnet_t ifp,protocol_family_t proto_family,mbuf_t packetlist,void * route,const struct sockaddr * dest,int flags,struct flowadv * adv)6539 dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6540 void *route, const struct sockaddr *dest, int flags, struct flowadv *adv)
6541 {
6542 char *frame_type = NULL;
6543 char *dst_linkaddr = NULL;
6544 int retval = 0;
6545 char frame_type_buffer[DLIL_MAX_FRAME_TYPE_BUFFER_SIZE];
6546 char dst_linkaddr_buffer[DLIL_MAX_LINKADDR_BUFFER_SIZE];
6547 struct if_proto *proto = NULL;
6548 mbuf_t m = NULL;
6549 mbuf_t send_head = NULL;
6550 mbuf_t *send_tail = &send_head;
6551 int iorefcnt = 0;
6552 u_int32_t pre = 0, post = 0;
6553 u_int32_t fpkts = 0, fbytes = 0;
6554 int32_t flen = 0;
6555 struct timespec now;
6556 u_int64_t now_nsec;
6557 boolean_t did_clat46 = FALSE;
6558 protocol_family_t old_proto_family = proto_family;
6559 struct sockaddr_in6 dest6;
6560 struct rtentry *rt = NULL;
6561 u_int16_t m_loop_set = 0;
6562 bool raw = (flags & DLIL_OUTPUT_FLAGS_RAW) != 0;
6563
6564 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6565
6566 /*
6567 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6568 * from happening while this operation is in progress
6569 */
6570 if (!ifnet_datamov_begin(ifp)) {
6571 retval = ENXIO;
6572 goto cleanup;
6573 }
6574 iorefcnt = 1;
6575
6576 VERIFY(ifp->if_output_dlil != NULL);
6577
6578 /* update the driver's multicast filter, if needed */
6579 if (ifp->if_updatemcasts > 0) {
6580 if_mcasts_update_async(ifp);
6581 ifp->if_updatemcasts = 0;
6582 }
6583
6584 frame_type = frame_type_buffer;
6585 dst_linkaddr = dst_linkaddr_buffer;
6586
6587 if (flags == DLIL_OUTPUT_FLAGS_NONE) {
6588 ifnet_lock_shared(ifp);
6589 /* callee holds a proto refcnt upon success */
6590 proto = find_attached_proto(ifp, proto_family);
6591 if (proto == NULL) {
6592 ifnet_lock_done(ifp);
6593 retval = ENXIO;
6594 goto cleanup;
6595 }
6596 ifnet_lock_done(ifp);
6597 }
6598
6599 preout_again:
6600 if (packetlist == NULL) {
6601 goto cleanup;
6602 }
6603
6604 m = packetlist;
6605 packetlist = packetlist->m_nextpkt;
6606 m->m_nextpkt = NULL;
6607
6608 m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6609
6610 /*
6611 * Perform address family translation for the first
6612 * packet outside the loop in order to perform address
6613 * lookup for the translated proto family.
6614 */
6615 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6616 (ifp->if_type == IFT_CELLULAR ||
6617 dlil_is_clat_needed(proto_family, m))) {
6618 retval = dlil_clat46(ifp, &proto_family, &m);
6619 /*
6620 * Go to the next packet if translation fails
6621 */
6622 if (retval != 0) {
6623 m_freem(m);
6624 m = NULL;
6625 ip6stat.ip6s_clat464_out_drop++;
6626 /* Make sure that the proto family is PF_INET */
6627 ASSERT(proto_family == PF_INET);
6628 goto preout_again;
6629 }
6630 /*
6631 * Free the old one and make it point to the IPv6 proto structure.
6632 *
6633 * Change proto for the first time we have successfully
6634 * performed address family translation.
6635 */
6636 if (!did_clat46 && proto_family == PF_INET6) {
6637 did_clat46 = TRUE;
6638
6639 if (proto != NULL) {
6640 if_proto_free(proto);
6641 }
6642 ifnet_lock_shared(ifp);
6643 /* callee holds a proto refcnt upon success */
6644 proto = find_attached_proto(ifp, proto_family);
6645 if (proto == NULL) {
6646 ifnet_lock_done(ifp);
6647 retval = ENXIO;
6648 m_freem(m);
6649 m = NULL;
6650 goto cleanup;
6651 }
6652 ifnet_lock_done(ifp);
6653 if (ifp->if_type == IFT_ETHER) {
6654 /* Update the dest to translated v6 address */
6655 dest6.sin6_len = sizeof(struct sockaddr_in6);
6656 dest6.sin6_family = AF_INET6;
6657 dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
6658 dest = SA(&dest6);
6659
6660 /*
6661 * Lookup route to the translated destination
6662 * Free this route ref during cleanup
6663 */
6664 rt = rtalloc1_scoped(SA(&dest6),
6665 0, 0, ifp->if_index);
6666
6667 route = rt;
6668 }
6669 }
6670 }
6671
6672 /*
6673 * This path gets packet chain going to the same destination.
6674 * The pre output routine is used to either trigger resolution of
6675 * the next hop or retrieve the next hop's link layer addressing.
6676 * For ex: ether_inet(6)_pre_output routine.
6677 *
6678 * If the routine returns EJUSTRETURN, it implies that packet has
6679 * been queued, and therefore we have to call preout_again for the
6680 * following packet in the chain.
6681 *
6682 * For errors other than EJUSTRETURN, the current packet is freed
6683 * and the rest of the chain (pointed by packetlist is freed as
6684 * part of clean up.
6685 *
6686 * Else if there is no error the retrieved information is used for
6687 * all the packets in the chain.
6688 */
6689 if (flags == DLIL_OUTPUT_FLAGS_NONE) {
6690 proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
6691 proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
6692 retval = 0;
6693 if (preoutp != NULL) {
6694 retval = preoutp(ifp, proto_family, &m, dest, route,
6695 frame_type, dst_linkaddr);
6696
6697 if (retval != 0) {
6698 if (retval == EJUSTRETURN) {
6699 goto preout_again;
6700 }
6701 m_freem(m);
6702 m = NULL;
6703 goto cleanup;
6704 }
6705 }
6706 }
6707
6708 nanouptime(&now);
6709 net_timernsec(&now, &now_nsec);
6710
6711 do {
6712 /*
6713 * pkt_hdr is set here to point to m_data prior to
6714 * calling into the framer. This value of pkt_hdr is
6715 * used by the netif gso logic to retrieve the ip header
6716 * for the TCP packets, offloaded for TSO processing.
6717 */
6718 if (raw && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
6719 uint8_t vlan_encap_len = 0;
6720
6721 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
6722 vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
6723 }
6724 m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
6725 } else {
6726 m->m_pkthdr.pkt_hdr = mtod(m, void *);
6727 }
6728
6729 /*
6730 * Perform address family translation if needed.
6731 * For now we only support stateless 4 to 6 translation
6732 * on the out path.
6733 *
6734 * The routine below translates IP header, updates protocol
6735 * checksum and also translates ICMP.
6736 *
6737 * We skip the first packet as it is already translated and
6738 * the proto family is set to PF_INET6.
6739 */
6740 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6741 (ifp->if_type == IFT_CELLULAR ||
6742 dlil_is_clat_needed(proto_family, m))) {
6743 retval = dlil_clat46(ifp, &proto_family, &m);
6744 /* Goto the next packet if the translation fails */
6745 if (retval != 0) {
6746 m_freem(m);
6747 m = NULL;
6748 ip6stat.ip6s_clat464_out_drop++;
6749 goto next;
6750 }
6751 }
6752
6753 #if CONFIG_DTRACE
6754 if (flags == DLIL_OUTPUT_FLAGS_NONE) {
6755 dlil_output_dtrace(ifp, proto_family, m);
6756 }
6757 #endif /* CONFIG_DTRACE */
6758
6759 if (flags == DLIL_OUTPUT_FLAGS_NONE && ifp->if_framer != NULL) {
6760 int rcvif_set = 0;
6761
6762 /*
6763 * If this is a broadcast packet that needs to be
6764 * looped back into the system, set the inbound ifp
6765 * to that of the outbound ifp. This will allow
6766 * us to determine that it is a legitimate packet
6767 * for the system. Only set the ifp if it's not
6768 * already set, just to be safe.
6769 */
6770 if ((m->m_flags & (M_BCAST | M_LOOP)) &&
6771 m->m_pkthdr.rcvif == NULL) {
6772 m->m_pkthdr.rcvif = ifp;
6773 rcvif_set = 1;
6774 }
6775 m_loop_set = m->m_flags & M_LOOP;
6776 retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
6777 frame_type, &pre, &post);
6778 if (retval != 0) {
6779 if (retval != EJUSTRETURN) {
6780 m_freem(m);
6781 }
6782 goto next;
6783 }
6784
6785 /*
6786 * For partial checksum offload, adjust the start
6787 * and stuff offsets based on the prepended header.
6788 */
6789 if ((m->m_pkthdr.csum_flags &
6790 (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6791 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6792 m->m_pkthdr.csum_tx_stuff += pre;
6793 m->m_pkthdr.csum_tx_start += pre;
6794 }
6795
6796 if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
6797 dlil_output_cksum_dbg(ifp, m, pre,
6798 proto_family);
6799 }
6800
6801 /*
6802 * Clear the ifp if it was set above, and to be
6803 * safe, only if it is still the same as the
6804 * outbound ifp we have in context. If it was
6805 * looped back, then a copy of it was sent to the
6806 * loopback interface with the rcvif set, and we
6807 * are clearing the one that will go down to the
6808 * layer below.
6809 */
6810 if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
6811 m->m_pkthdr.rcvif = NULL;
6812 }
6813 }
6814
6815 /*
6816 * Let interface filters (if any) do their thing ...
6817 */
6818 if ((flags & DLIL_OUTPUT_FLAGS_SKIP_IF_FILTERS) == 0) {
6819 retval = dlil_interface_filters_output(ifp, &m, proto_family);
6820 if (retval != 0) {
6821 if (retval != EJUSTRETURN) {
6822 m_freem(m);
6823 }
6824 goto next;
6825 }
6826 }
6827 /*
6828 * Strip away M_PROTO1 bit prior to sending packet
6829 * to the driver as this field may be used by the driver
6830 */
6831 m->m_flags &= ~M_PROTO1;
6832
6833 /*
6834 * If the underlying interface is not capable of handling a
6835 * packet whose data portion spans across physically disjoint
6836 * pages, we need to "normalize" the packet so that we pass
6837 * down a chain of mbufs where each mbuf points to a span that
6838 * resides in the system page boundary. If the packet does
6839 * not cross page(s), the following is a no-op.
6840 */
6841 if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
6842 if ((m = m_normalize(m)) == NULL) {
6843 goto next;
6844 }
6845 }
6846
6847 /*
6848 * If this is a TSO packet, make sure the interface still
6849 * advertise TSO capability.
6850 */
6851 if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
6852 retval = EMSGSIZE;
6853 m_freem(m);
6854 goto cleanup;
6855 }
6856
6857 ifp_inc_traffic_class_out(ifp, m);
6858
6859 #if SKYWALK
6860 /*
6861 * For native skywalk devices, packets will be passed to pktap
6862 * after GSO or after the mbuf to packet conversion.
6863 * This is done for IPv4/IPv6 packets only because there is no
6864 * space in the mbuf to pass down the proto family.
6865 */
6866 if (dlil_is_native_netif_nexus(ifp)) {
6867 if (raw || m->m_pkthdr.pkt_proto == 0) {
6868 pktap_output(ifp, proto_family, m, pre, post);
6869 m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
6870 }
6871 } else {
6872 pktap_output(ifp, proto_family, m, pre, post);
6873 }
6874 #else /* SKYWALK */
6875 pktap_output(ifp, proto_family, m, pre, post);
6876 #endif /* SKYWALK */
6877
6878 /*
6879 * Count the number of elements in the mbuf chain
6880 */
6881 if (tx_chain_len_count) {
6882 dlil_count_chain_len(m, &tx_chain_len_stats);
6883 }
6884
6885 /*
6886 * Discard partial sum information if this packet originated
6887 * from another interface; the packet would already have the
6888 * final checksum and we shouldn't recompute it.
6889 */
6890 if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
6891 (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6892 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6893 m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
6894 m->m_pkthdr.csum_data = 0;
6895 }
6896
6897 /*
6898 * Finally, call the driver.
6899 */
6900 if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
6901 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6902 flen += (m_pktlen(m) - (pre + post));
6903 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6904 }
6905 (void) mbuf_set_timestamp(m, now_nsec, TRUE);
6906
6907 *send_tail = m;
6908 send_tail = &m->m_nextpkt;
6909 } else {
6910 /*
6911 * Record timestamp; ifnet_enqueue() will use this info
6912 * rather than redoing the work.
6913 */
6914 nanouptime(&now);
6915 net_timernsec(&now, &now_nsec);
6916 (void) mbuf_set_timestamp(m, now_nsec, TRUE);
6917
6918 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
6919 flen = (m_pktlen(m) - (pre + post));
6920 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
6921 } else {
6922 flen = 0;
6923 }
6924 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
6925 0, 0, 0, 0, 0);
6926 retval = (*ifp->if_output_dlil)(ifp, m);
6927 if (retval == EQFULL || retval == EQSUSPENDED) {
6928 if (adv != NULL && adv->code == FADV_SUCCESS) {
6929 adv->code = (retval == EQFULL ?
6930 FADV_FLOW_CONTROLLED :
6931 FADV_SUSPENDED);
6932 }
6933 retval = 0;
6934 }
6935 if (retval == 0 && flen > 0) {
6936 fbytes += flen;
6937 fpkts++;
6938 }
6939 if (retval != 0 && dlil_verbose) {
6940 DLIL_PRINTF("%s: output error on %s retval = %d\n",
6941 __func__, if_name(ifp),
6942 retval);
6943 }
6944 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
6945 0, 0, 0, 0, 0);
6946 }
6947 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6948
6949 next:
6950 m = packetlist;
6951 if (m != NULL) {
6952 m->m_flags |= m_loop_set;
6953 packetlist = packetlist->m_nextpkt;
6954 m->m_nextpkt = NULL;
6955 }
6956 /* Reset the proto family to old proto family for CLAT */
6957 if (did_clat46) {
6958 proto_family = old_proto_family;
6959 }
6960 } while (m != NULL);
6961
6962 if (send_head != NULL) {
6963 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
6964 0, 0, 0, 0, 0);
6965 if (ifp->if_eflags & IFEF_SENDLIST) {
6966 retval = (*ifp->if_output_dlil)(ifp, send_head);
6967 if (retval == EQFULL || retval == EQSUSPENDED) {
6968 if (adv != NULL) {
6969 adv->code = (retval == EQFULL ?
6970 FADV_FLOW_CONTROLLED :
6971 FADV_SUSPENDED);
6972 }
6973 retval = 0;
6974 }
6975 if (retval == 0 && flen > 0) {
6976 fbytes += flen;
6977 fpkts++;
6978 }
6979 if (retval != 0 && dlil_verbose) {
6980 DLIL_PRINTF("%s: output error on %s retval = %d\n",
6981 __func__, if_name(ifp), retval);
6982 }
6983 } else {
6984 struct mbuf *send_m;
6985 int enq_cnt = 0;
6986 VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
6987 while (send_head != NULL) {
6988 send_m = send_head;
6989 send_head = send_m->m_nextpkt;
6990 send_m->m_nextpkt = NULL;
6991 retval = (*ifp->if_output_dlil)(ifp, send_m);
6992 if (retval == EQFULL || retval == EQSUSPENDED) {
6993 if (adv != NULL) {
6994 adv->code = (retval == EQFULL ?
6995 FADV_FLOW_CONTROLLED :
6996 FADV_SUSPENDED);
6997 }
6998 retval = 0;
6999 }
7000 if (retval == 0) {
7001 enq_cnt++;
7002 if (flen > 0) {
7003 fpkts++;
7004 }
7005 }
7006 if (retval != 0 && dlil_verbose) {
7007 DLIL_PRINTF("%s: output error on %s "
7008 "retval = %d\n",
7009 __func__, if_name(ifp), retval);
7010 }
7011 }
7012 if (enq_cnt > 0) {
7013 fbytes += flen;
7014 ifnet_start(ifp);
7015 }
7016 }
7017 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7018 }
7019
7020 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7021
7022 cleanup:
7023 if (fbytes > 0) {
7024 ifp->if_fbytes += fbytes;
7025 }
7026 if (fpkts > 0) {
7027 ifp->if_fpackets += fpkts;
7028 }
7029 if (proto != NULL) {
7030 if_proto_free(proto);
7031 }
7032 if (packetlist) { /* if any packets are left, clean up */
7033 mbuf_freem_list(packetlist);
7034 }
7035 if (retval == EJUSTRETURN) {
7036 retval = 0;
7037 }
7038 if (iorefcnt == 1) {
7039 ifnet_datamov_end(ifp);
7040 }
7041 if (rt != NULL) {
7042 rtfree(rt);
7043 rt = NULL;
7044 }
7045
7046 return retval;
7047 }
7048
7049 /*
7050 * This routine checks if the destination address is not a loopback, link-local,
7051 * multicast or broadcast address.
7052 */
7053 static int
dlil_is_clat_needed(protocol_family_t proto_family,mbuf_t m)7054 dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7055 {
7056 int ret = 0;
7057 switch (proto_family) {
7058 case PF_INET: {
7059 struct ip *iph = mtod(m, struct ip *);
7060 if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7061 ret = 1;
7062 }
7063 break;
7064 }
7065 case PF_INET6: {
7066 struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7067 if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7068 CLAT64_NEEDED(&ip6h->ip6_dst)) {
7069 ret = 1;
7070 }
7071 break;
7072 }
7073 }
7074
7075 return ret;
7076 }
7077 /*
7078 * @brief This routine translates IPv4 packet to IPv6 packet,
7079 * updates protocol checksum and also translates ICMP for code
7080 * along with inner header translation.
7081 *
7082 * @param ifp Pointer to the interface
7083 * @param proto_family pointer to protocol family. It is updated if function
7084 * performs the translation successfully.
7085 * @param m Pointer to the pointer pointing to the packet. Needed because this
7086 * routine can end up changing the mbuf to a different one.
7087 *
7088 * @return 0 on success or else a negative value.
7089 */
static errno_t
dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET);
	VERIFY(IS_INTF_CLAT46(ifp));

	pbuf_t pbuf_store, *pbuf = NULL;
	struct ip *iph = NULL;
	struct in_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_addr src_storage = {};
	struct in6_addr *src = NULL;
	struct sockaddr_in6 dstsock = {};
	int error = 0;
	uint16_t off = 0;
	uint16_t tot_len = 0;
	uint16_t ip_id_val = 0;
	uint16_t ip_frag_off = 0;

	boolean_t is_frag = FALSE;
	boolean_t is_first_frag = TRUE;
	boolean_t is_last_frag = TRUE;

	/*
	 * Wrap the caller's mbuf in a pbuf so the nat464 routines can
	 * operate on it; the (possibly replaced) mbuf is handed back to
	 * the caller at the cleanup label below.
	 */
	pbuf_init_mbuf(&pbuf_store, *m, ifp);
	pbuf = &pbuf_store;
	iph = pbuf->pb_data;

	/*
	 * Capture the original IPv4 header fields now; the translation
	 * below overwrites the header in place, after which iph is
	 * invalid.
	 */
	osrc = iph->ip_src;
	odst = iph->ip_dst;
	proto = iph->ip_p;
	off = (uint16_t)(iph->ip_hl << 2);
	ip_id_val = iph->ip_id;
	ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;

	tot_len = ntohs(iph->ip_len);

	/*
	 * For packets that are not first frags
	 * we only need to adjust CSUM.
	 * For 4 to 6, Fragmentation header gets appended
	 * after proto translation.
	 */
	if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
		is_frag = TRUE;

		/* If the offset is not zero, it is not first frag */
		if (ip_frag_off != 0) {
			is_first_frag = FALSE;
		}

		/* If IP_MF is set, then it is not last frag */
		if (ntohs(iph->ip_off) & IP_MF) {
			is_last_frag = FALSE;
		}
	}

	/*
	 * Translate IPv4 destination to IPv6 destination by using the
	 * prefixes learned through prior PLAT discovery.
	 */
	if ((error = nat464_synthesize_ipv6(ifp, &odst, &dstsock.sin6_addr)) != 0) {
		ip6stat.ip6s_clat464_out_v6synthfail_drop++;
		goto cleanup;
	}

	dstsock.sin6_len = sizeof(struct sockaddr_in6);
	dstsock.sin6_family = AF_INET6;

	/*
	 * Retrive the local IPv6 CLAT46 address reserved for stateless
	 * translation.
	 */
	src = in6_selectsrc_core(&dstsock, 0, ifp, 0, &src_storage, NULL, &error,
	    NULL, NULL, TRUE);

	if (src == NULL) {
		ip6stat.ip6s_clat464_out_nov6addr_drop++;
		error = -1;
		goto cleanup;
	}


	/* Translate the IP header part first */
	error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
	    iph->ip_ttl, src_storage, dstsock.sin6_addr, tot_len) == NT_NAT64) ? 0 : -1;

	iph = NULL; /* Invalidate iph as pbuf has been modified */

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46transfail_drop++;
		goto cleanup;
	}

	/*
	 * Translate protocol header, update checksum, checksum flags
	 * and related fields.
	 */
	error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
	    proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;

	if (error != 0) {
		ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
		goto cleanup;
	}

	/* Now insert the IPv6 fragment header */
	if (is_frag) {
		error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);

		if (error != 0) {
			ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
			goto cleanup;
		}
	}

cleanup:
	if (pbuf_is_valid(pbuf)) {
		/* Hand the (possibly reallocated) mbuf back to the caller. */
		*m = pbuf->pb_mbuf;
		pbuf->pb_mbuf = NULL;
		pbuf_destroy(pbuf);
	} else {
		/* The pbuf lost its backing mbuf; nothing to return. */
		error = -1;
		*m = NULL;
		ip6stat.ip6s_clat464_out_invalpbuf_drop++;
	}

	if (error == 0) {
		/* Success: tell the caller the packet is now IPv6. */
		*proto_family = PF_INET6;
		ip6stat.ip6s_clat464_out_success++;
	}

	return error;
}
7223
7224 /*
7225 * @brief This routine translates incoming IPv6 to IPv4 packet,
7226 * updates protocol checksum and also translates ICMPv6 outer
7227 * and inner headers
7228 *
7229 * @return 0 on success or else a negative value.
7230 */
static errno_t
dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
{
	VERIFY(*proto_family == PF_INET6);
	VERIFY(IS_INTF_CLAT46(ifp));

	struct ip6_hdr *ip6h = NULL;
	struct in6_addr osrc, odst;
	uint8_t proto = 0;
	struct in6_ifaddr *ia6_clat_dst = NULL;
	struct in_ifaddr *ia4_clat_dst = NULL;
	struct in_addr *dst = NULL;
	struct in_addr src;
	int error = 0;
	uint32_t off = 0;
	u_int64_t tot_len = 0;
	uint8_t tos = 0;
	boolean_t is_first_frag = TRUE;

	/* Incoming mbuf does not contain valid IP6 header */
	if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
	    ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
	    (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	ip6h = mtod(*m, struct ip6_hdr *);
	/* Validate that mbuf contains IP payload equal to ip6_plen */
	if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
		ip6stat.ip6s_clat464_in_tooshort_drop++;
		return -1;
	}

	/*
	 * Capture the original IPv6 addresses; the translation below
	 * overwrites the header in place.
	 */
	osrc = ip6h->ip6_src;
	odst = ip6h->ip6_dst;

	/*
	 * Retrieve the local CLAT46 reserved IPv6 address.
	 * Let the packet pass if we don't find one, as the flag
	 * may get set before IPv6 configuration has taken place.
	 */
	ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
	if (ia6_clat_dst == NULL) {
		goto done;
	}

	/*
	 * Check if the original dest in the packet is same as the reserved
	 * CLAT46 IPv6 address
	 */
	if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
		pbuf_t pbuf_store, *pbuf = NULL;
		/*
		 * Wrap the mbuf in a pbuf for the nat464 routines; the
		 * mbuf is handed back at the cleanup label below.
		 */
		pbuf_init_mbuf(&pbuf_store, *m, ifp);
		pbuf = &pbuf_store;

		/*
		 * Retrive the local CLAT46 IPv4 address reserved for stateless
		 * translation.
		 */
		ia4_clat_dst = inifa_ifpclatv4(ifp);
		if (ia4_clat_dst == NULL) {
			/* Drop the IPv6 address ref taken above before bailing. */
			ifa_remref(&ia6_clat_dst->ia_ifa);
			ip6stat.ip6s_clat464_in_nov4addr_drop++;
			error = -1;
			goto cleanup;
		}
		ifa_remref(&ia6_clat_dst->ia_ifa);

		/* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
		dst = &ia4_clat_dst->ia_addr.sin_addr;
		if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
			ip6stat.ip6s_clat464_in_v4synthfail_drop++;
			error = -1;
			goto cleanup;
		}

		ip6h = pbuf->pb_data;
		off = sizeof(struct ip6_hdr);
		proto = ip6h->ip6_nxt;
		/* Traffic class bits of the flow label become the IPv4 TOS. */
		tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
		tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);

		/*
		 * Translate the IP header and update the fragmentation
		 * header if needed
		 */
		error = (nat464_translate_64(pbuf, off, tos, &proto,
		    ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
		    0 : -1;

		ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64transfail_drop++;
			goto cleanup;
		}

		/*
		 * Translate protocol header, update checksum, checksum flags
		 * and related fields.
		 */
		error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
		    (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
		    NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;

		if (error != 0) {
			ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
			goto cleanup;
		}

cleanup:
		if (ia4_clat_dst != NULL) {
			ifa_remref(&ia4_clat_dst->ia_ifa);
		}

		if (pbuf_is_valid(pbuf)) {
			/* Hand the (possibly reallocated) mbuf back to the caller. */
			*m = pbuf->pb_mbuf;
			pbuf->pb_mbuf = NULL;
			pbuf_destroy(pbuf);
		} else {
			error = -1;
			ip6stat.ip6s_clat464_in_invalpbuf_drop++;
		}

		if (error == 0) {
			/* Success: tell the caller the packet is now IPv4. */
			*proto_family = PF_INET;
			ip6stat.ip6s_clat464_in_success++;
		}
	} /* CLAT traffic */

done:
	return error;
}
7365
7366 /* The following is used to enqueue work items for ifnet ioctl events */
7367 static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);
7368
/* Argument captured for a deferred ifnet ioctl work item. */
struct ifnet_ioctl_event {
	struct ifnet *ifp;      /* interface the ioctl targets (io ref held) */
	u_long ioctl_code;      /* SIOCADDMULTI or SIOCDELMULTI */
};
7373
/* Work-queue entry embedding the ioctl argument for nwk_wq processing. */
struct ifnet_ioctl_event_nwk_wq_entry {
	struct nwk_wq_entry nwk_wqe;            /* must be first; queued via __container_of */
	struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
};
7378
7379 void
ifnet_ioctl_async(struct ifnet * ifp,u_long ioctl_code)7380 ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7381 {
7382 struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7383 bool compare_expected;
7384
7385 /*
7386 * Get an io ref count if the interface is attached.
7387 * At this point it most likely is. We are taking a reference for
7388 * deferred processing.
7389 */
7390 if (!ifnet_is_attached(ifp, 1)) {
7391 os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7392 "is not attached",
7393 __func__, __LINE__, if_name(ifp), ioctl_code);
7394 return;
7395 }
7396 switch (ioctl_code) {
7397 case SIOCADDMULTI:
7398 compare_expected = false;
7399 if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
7400 ifnet_decr_iorefcnt(ifp);
7401 return;
7402 }
7403 break;
7404 case SIOCDELMULTI:
7405 compare_expected = false;
7406 if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
7407 ifnet_decr_iorefcnt(ifp);
7408 return;
7409 }
7410 break;
7411 default:
7412 os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
7413 __func__, __LINE__, if_name(ifp), ioctl_code);
7414 return;
7415 }
7416
7417 p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7418 Z_WAITOK | Z_ZERO | Z_NOFAIL);
7419
7420 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7421 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7422 p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7423 nwk_wq_enqueue(&p_ifnet_ioctl_ev->nwk_wqe);
7424 }
7425
7426 static void
ifnet_ioctl_event_callback(struct nwk_wq_entry * nwk_item)7427 ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7428 {
7429 struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7430 struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7431
7432 struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7433 u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7434 int ret = 0;
7435
7436 switch (ioctl_code) {
7437 case SIOCADDMULTI:
7438 atomic_store(&ifp->if_mcast_add_signaled, false);
7439 break;
7440 case SIOCDELMULTI:
7441 atomic_store(&ifp->if_mcast_del_signaled, false);
7442 break;
7443 }
7444 if ((ret = ifnet_ioctl(ifp, 0, ioctl_code, NULL)) != 0) {
7445 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7446 __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7447 } else if (dlil_verbose) {
7448 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7449 "for ioctl %lu",
7450 __func__, __LINE__, if_name(ifp), ioctl_code);
7451 }
7452 ifnet_decr_iorefcnt(ifp);
7453 kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7454 return;
7455 }
7456
/*
 * Run an ioctl through the DLIL stack: interface filters first, then the
 * attached protocol (when proto_fam != 0), then the driver itself.  A
 * result other than 0/EOPNOTSUPP from any stage wins and short-circuits
 * the rest; EJUSTRETURN also wins but is mapped to 0 for the caller.
 */
errno_t
ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
    void *ioctl_arg)
{
	struct ifnet_filter *filter;
	int retval = EOPNOTSUPP;
	int result = 0;

	if (ifp == NULL || ioctl_code == 0) {
		return EINVAL;
	}

	/* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
		return EOPNOTSUPP;
	}

	/*
	 * Run the interface filters first.
	 * We want to run all filters before calling the protocol,
	 * interface family, or interface.
	 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
	/* prevent filter list from changing in case we drop the lock */
	if_flt_monitor_busy(ifp);
	TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
		if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
		    filter->filt_protocol == proto_fam)) {
			/*
			 * Drop the lock across the callout; the busy marker
			 * set above keeps the filter list stable meanwhile.
			 */
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_ioctl(filter->filt_cookie, ifp,
			    proto_fam, ioctl_code, ioctl_arg);

			lck_mtx_lock_spin(&ifp->if_flt_lock);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval != 0 && retval != EOPNOTSUPP) {
					/* we're done with the filter list */
					if_flt_monitor_unbusy(ifp);
					lck_mtx_unlock(&ifp->if_flt_lock);
					goto cleanup;
				}
			}
		}
	}
	/* we're done with the filter list */
	if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

	/* Allow the protocol to handle the ioctl */
	if (proto_fam != 0) {
		struct if_proto *proto;

		/* callee holds a proto refcnt upon success */
		ifnet_lock_shared(ifp);
		proto = find_attached_proto(ifp, proto_fam);
		ifnet_lock_done(ifp);
		if (proto != NULL) {
			/* Dispatch through whichever KPI version the proto uses. */
			proto_media_ioctl ioctlp =
			    (proto->proto_kpi == kProtoKPI_v1 ?
			    proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
			result = EOPNOTSUPP;
			if (ioctlp != NULL) {
				result = ioctlp(ifp, proto_fam, ioctl_code,
				    ioctl_arg);
			}
			if_proto_free(proto);

			/* Only update retval if no one has handled the ioctl */
			if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
				if (result == ENOTSUP) {
					result = EOPNOTSUPP;
				}
				retval = result;
				if (retval && retval != EOPNOTSUPP) {
					goto cleanup;
				}
			}
		}
	}

	/* retval is either 0 or EOPNOTSUPP */

	/*
	 * Let the interface handle this ioctl.
	 * If it returns EOPNOTSUPP, ignore that, we may have
	 * already handled this in the protocol or family.
	 */
	if (ifp->if_ioctl) {
		result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
	}

	/* Only update retval if no one has handled the ioctl */
	if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
		if (result == ENOTSUP) {
			result = EOPNOTSUPP;
		}
		retval = result;
		if (retval && retval != EOPNOTSUPP) {
			goto cleanup;
		}
	}

cleanup:
	/* EJUSTRETURN means "handled, stop processing" — report success. */
	if (retval == EJUSTRETURN) {
		retval = 0;
	}

	ifnet_decr_iorefcnt(ifp);

	return retval;
}
7574
7575 __private_extern__ errno_t
dlil_set_bpf_tap(ifnet_t ifp,bpf_tap_mode mode,bpf_packet_func callback)7576 dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7577 {
7578 errno_t error = 0;
7579
7580 if (ifp->if_set_bpf_tap) {
7581 /* Get an io reference on the interface if it is attached */
7582 if (!ifnet_is_attached(ifp, 1)) {
7583 return ENXIO;
7584 }
7585 error = ifp->if_set_bpf_tap(ifp, mode, callback);
7586 ifnet_decr_iorefcnt(ifp);
7587 }
7588 return error;
7589 }
7590
7591 errno_t
dlil_resolve_multi(struct ifnet * ifp,const struct sockaddr * proto_addr,struct sockaddr * ll_addr,size_t ll_len)7592 dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7593 struct sockaddr *ll_addr, size_t ll_len)
7594 {
7595 errno_t result = EOPNOTSUPP;
7596 struct if_proto *proto;
7597 const struct sockaddr *verify;
7598 proto_media_resolve_multi resolvep;
7599
7600 if (!ifnet_is_attached(ifp, 1)) {
7601 return result;
7602 }
7603
7604 bzero(ll_addr, ll_len);
7605
7606 /* Call the protocol first; callee holds a proto refcnt upon success */
7607 ifnet_lock_shared(ifp);
7608 proto = find_attached_proto(ifp, proto_addr->sa_family);
7609 ifnet_lock_done(ifp);
7610 if (proto != NULL) {
7611 resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7612 proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7613 if (resolvep != NULL) {
7614 result = resolvep(ifp, proto_addr, SDL(ll_addr), ll_len);
7615 }
7616 if_proto_free(proto);
7617 }
7618
7619 /* Let the interface verify the multicast address */
7620 if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7621 if (result == 0) {
7622 verify = ll_addr;
7623 } else {
7624 verify = proto_addr;
7625 }
7626 result = ifp->if_check_multi(ifp, verify);
7627 }
7628
7629 ifnet_decr_iorefcnt(ifp);
7630 return result;
7631 }
7632
7633 __private_extern__ errno_t
dlil_send_arp_internal(ifnet_t ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)7634 dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7635 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7636 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7637 {
7638 struct if_proto *proto;
7639 errno_t result = 0;
7640
7641 if ((ifp->if_flags & IFF_NOARP) != 0) {
7642 result = ENOTSUP;
7643 goto done;
7644 }
7645
7646 /* callee holds a proto refcnt upon success */
7647 ifnet_lock_shared(ifp);
7648 proto = find_attached_proto(ifp, target_proto->sa_family);
7649 ifnet_lock_done(ifp);
7650 if (proto == NULL) {
7651 result = ENOTSUP;
7652 } else {
7653 proto_media_send_arp arpp;
7654 arpp = (proto->proto_kpi == kProtoKPI_v1 ?
7655 proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
7656 if (arpp == NULL) {
7657 result = ENOTSUP;
7658 } else {
7659 switch (arpop) {
7660 case ARPOP_REQUEST:
7661 arpstat.txrequests++;
7662 if (target_hw != NULL) {
7663 arpstat.txurequests++;
7664 }
7665 break;
7666 case ARPOP_REPLY:
7667 arpstat.txreplies++;
7668 break;
7669 }
7670 result = arpp(ifp, arpop, sender_hw, sender_proto,
7671 target_hw, target_proto);
7672 }
7673 if_proto_free(proto);
7674 }
7675 done:
7676 return result;
7677 }
7678
/*
 * Zero-sized marker type: thread-mark push/pop state is encoded in the
 * pointer VALUE (an offset from net_thread_marks_base), never dereferenced.
 */
struct net_thread_marks { };
static const struct net_thread_marks net_thread_marks_base = { };

/* Sentinel meaning "no marks were pushed" (offset 0 from the base). */
__private_extern__ const net_thread_marks_t net_thread_marks_none =
    &net_thread_marks_base;
7684
/*
 * Set the given mark bits on the current thread and return an opaque
 * token recording which bits were NEWLY set (bits already set are not
 * recorded, so a later pop restores exactly the prior state).  The token
 * is the base pointer offset by the newly-set bit mask; it is only ever
 * compared/subtracted, never dereferenced.
 */
__private_extern__ net_thread_marks_t
net_thread_marks_push(u_int32_t push)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t pop = 0;

	if (push != 0) {
		struct uthread *uth = current_uthread();

		/* Bits requested but not yet set — these are ours to undo. */
		pop = push & ~uth->uu_network_marks;
		if (pop != 0) {
			uth->uu_network_marks |= pop;
		}
	}

	/* Encode the bits-to-undo as a pointer offset from the base. */
	return (net_thread_marks_t)&base[pop];
}
7702
/*
 * Clear the given mark bits on the current thread and return a token
 * recording which bits were actually cleared (mirror image of
 * net_thread_marks_push); net_thread_unmarks_pop re-sets them.
 */
__private_extern__ net_thread_marks_t
net_thread_unmarks_push(u_int32_t unpush)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	u_int32_t unpop = 0;

	if (unpush != 0) {
		struct uthread *uth = current_uthread();

		/* Bits requested for clearing that were actually set. */
		unpop = unpush & uth->uu_network_marks;
		if (unpop != 0) {
			uth->uu_network_marks &= ~unpop;
		}
	}

	/* Encode the bits-to-restore as a pointer offset from the base. */
	return (net_thread_marks_t)&base[unpop];
}
7720
/*
 * Undo a prior net_thread_marks_push(): decode the newly-set bit mask
 * from the token's offset against the base pointer and clear those bits
 * on the current thread.
 */
__private_extern__ void
net_thread_marks_pop(net_thread_marks_t popx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	const ptrdiff_t pop = (const char *)popx - (const char *)base;

	if (pop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* The offset must fit in 32 bits and every bit must be set. */
		VERIFY((pop & ones) == pop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
		uth->uu_network_marks &= ~pop;
	}
}
7736
/*
 * Undo a prior net_thread_unmarks_push(): decode the cleared-bit mask
 * from the token's offset and re-set those bits on the current thread.
 */
__private_extern__ void
net_thread_unmarks_pop(net_thread_marks_t unpopx)
{
	static const char *const base = (const void*)&net_thread_marks_base;
	ptrdiff_t unpop = (const char *)unpopx - (const char *)base;

	if (unpop != 0) {
		static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
		struct uthread *uth = current_uthread();

		/* The offset must fit in 32 bits and every bit must be clear. */
		VERIFY((unpop & ones) == unpop);
		VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
		uth->uu_network_marks |= (u_int32_t)unpop;
	}
}
7752
7753 __private_extern__ u_int32_t
net_thread_is_marked(u_int32_t check)7754 net_thread_is_marked(u_int32_t check)
7755 {
7756 if (check != 0) {
7757 struct uthread *uth = current_uthread();
7758 return uth->uu_network_marks & check;
7759 } else {
7760 return 0;
7761 }
7762 }
7763
7764 __private_extern__ u_int32_t
net_thread_is_unmarked(u_int32_t check)7765 net_thread_is_unmarked(u_int32_t check)
7766 {
7767 if (check != 0) {
7768 struct uthread *uth = current_uthread();
7769 return ~uth->uu_network_marks & check;
7770 } else {
7771 return 0;
7772 }
7773 }
7774
7775 static __inline__ int
_is_announcement(const struct sockaddr_in * sender_sin,const struct sockaddr_in * target_sin)7776 _is_announcement(const struct sockaddr_in * sender_sin,
7777 const struct sockaddr_in * target_sin)
7778 {
7779 if (target_sin == NULL || sender_sin == NULL) {
7780 return FALSE;
7781 }
7782
7783 return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
7784 }
7785
/*
 * Send an ARP packet.  Normally delegates to dlil_send_arp_internal() on
 * the given interface; the special case is an ARP request for an IPv4
 * link-local target (when ipv4_ll_arp_aware is enabled and the request is
 * not an announcement), which is broadcast on every IFEF_ARPLL-capable
 * interface that has an IPv4 source address.
 */
__private_extern__ errno_t
dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
    const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
    const struct sockaddr *target_proto0, u_int32_t rtflags)
{
	errno_t result = 0;
	const struct sockaddr_in * sender_sin;
	const struct sockaddr_in * target_sin;
	struct sockaddr_inarp target_proto_sinarp;
	struct sockaddr *target_proto = __DECONST_SA(target_proto0);

	if (target_proto == NULL || sender_proto == NULL) {
		return EINVAL;
	}

	if (sender_proto->sa_family != target_proto->sa_family) {
		return EINVAL;
	}

	/*
	 * If the target is a (default) router, provide that
	 * information to the send_arp callback routine.
	 */
	if (rtflags & RTF_ROUTER) {
		SOCKADDR_COPY(target_proto, &target_proto_sinarp, sizeof(struct sockaddr_in));
		target_proto_sinarp.sin_other |= SIN_ROUTER;
		target_proto = SA(&target_proto_sinarp);
	}

	/*
	 * If this is an ARP request and the target IP is IPv4LL,
	 * send the request on all interfaces. The exception is
	 * an announcement, which must only appear on the specific
	 * interface.
	 */
	sender_sin = SIN(sender_proto);
	target_sin = SIN(target_proto);
	if (target_proto->sa_family == AF_INET &&
	    IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
	    ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
	    !_is_announcement(sender_sin, target_sin)) {
		ifnet_t *__counted_by(count) ifp_list;
		u_int32_t count;
		u_int32_t ifp_on;

		result = ENOTSUP;

		if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) {
			for (ifp_on = 0; ifp_on < count; ifp_on++) {
				errno_t new_result;
				ifaddr_t source_hw = NULL;
				ifaddr_t source_ip = NULL;
				struct sockaddr_in source_ip_copy;
				struct ifnet *cur_ifp = ifp_list[ifp_on];

				/*
				 * Only arp on interfaces marked for IPv4LL
				 * ARPing. This may mean that we don't ARP on
				 * the interface the subnet route points to.
				 */
				if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
					continue;
				}

				/* Find the source IP address */
				ifnet_lock_shared(cur_ifp);
				source_hw = cur_ifp->if_lladdr;
				TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
				    ifa_link) {
					IFA_LOCK(source_ip);
					if (source_ip->ifa_addr != NULL &&
					    source_ip->ifa_addr->sa_family ==
					    AF_INET) {
						/* Copy the source IP address */
						SOCKADDR_COPY(SIN(source_ip->ifa_addr), &source_ip_copy, sizeof(source_ip_copy));
						IFA_UNLOCK(source_ip);
						break;
					}
					IFA_UNLOCK(source_ip);
				}

				/* No IP Source, don't arp */
				if (source_ip == NULL) {
					ifnet_lock_done(cur_ifp);
					continue;
				}

				/*
				 * Hold the link-layer address across the
				 * send; the ifnet lock is dropped first.
				 */
				ifa_addref(source_hw);
				ifnet_lock_done(cur_ifp);

				/* Send the ARP */
				new_result = dlil_send_arp_internal(cur_ifp,
				    arpop, SDL(source_hw->ifa_addr),
				    SA(&source_ip_copy), NULL,
				    target_proto);

				ifa_remref(source_hw);
				/* Report the first result that isn't ENOTSUP. */
				if (result == ENOTSUP) {
					result = new_result;
				}
			}
			ifnet_list_free_counted_by(ifp_list, count);
		}
	} else {
		result = dlil_send_arp_internal(ifp, arpop, sender_hw,
		    sender_proto, target_hw, target_proto);
	}

	return result;
}
7896
7897 /*
7898 * Caller must hold ifnet head lock.
7899 */
7900 static int
ifnet_lookup(struct ifnet * ifp)7901 ifnet_lookup(struct ifnet *ifp)
7902 {
7903 struct ifnet *_ifp;
7904
7905 LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
7906 TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
7907 if (_ifp == ifp) {
7908 break;
7909 }
7910 }
7911 return _ifp != NULL;
7912 }
7913
7914 /*
7915 * Caller has to pass a non-zero refio argument to get a
7916 * IO reference count. This will prevent ifnet_detach from
7917 * being called when there are outstanding io reference counts.
7918 */
7919 int
ifnet_is_attached(struct ifnet * ifp,int refio)7920 ifnet_is_attached(struct ifnet *ifp, int refio)
7921 {
7922 int ret;
7923
7924 lck_mtx_lock_spin(&ifp->if_ref_lock);
7925 if ((ret = IF_FULLY_ATTACHED(ifp))) {
7926 if (refio > 0) {
7927 ifp->if_refio++;
7928 }
7929 }
7930 lck_mtx_unlock(&ifp->if_ref_lock);
7931
7932 return ret;
7933 }
7934
7935 void
ifnet_incr_pending_thread_count(struct ifnet * ifp)7936 ifnet_incr_pending_thread_count(struct ifnet *ifp)
7937 {
7938 lck_mtx_lock_spin(&ifp->if_ref_lock);
7939 ifp->if_threads_pending++;
7940 lck_mtx_unlock(&ifp->if_ref_lock);
7941 }
7942
7943 void
ifnet_decr_pending_thread_count(struct ifnet * ifp)7944 ifnet_decr_pending_thread_count(struct ifnet *ifp)
7945 {
7946 lck_mtx_lock_spin(&ifp->if_ref_lock);
7947 VERIFY(ifp->if_threads_pending > 0);
7948 ifp->if_threads_pending--;
7949 if (ifp->if_threads_pending == 0) {
7950 wakeup(&ifp->if_threads_pending);
7951 }
7952 lck_mtx_unlock(&ifp->if_ref_lock);
7953 }
7954
/*
 * Take an additional IO reference on an interface.
 *
 * Caller must ensure the interface is attached; the assumption is that
 * there is at least an outstanding IO reference count held already.
 * Most callers would call ifnet_is_{attached,data_ready}() instead.
 */
void
ifnet_incr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	/* both conditions are guaranteed by the caller's existing IO ref */
	VERIFY(IF_FULLY_ATTACHED(ifp));
	VERIFY(ifp->if_refio > 0);
	ifp->if_refio++;
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7969
7970 __attribute__((always_inline))
7971 static void
ifnet_decr_iorefcnt_locked(struct ifnet * ifp)7972 ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
7973 {
7974 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
7975
7976 VERIFY(ifp->if_refio > 0);
7977 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
7978
7979 ifp->if_refio--;
7980 VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);
7981
7982 /*
7983 * if there are no more outstanding io references, wakeup the
7984 * ifnet_detach thread if detaching flag is set.
7985 */
7986 if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
7987 wakeup(&(ifp->if_refio));
7988 }
7989 }
7990
/* Drop one IO reference, taking and releasing if_ref_lock around it. */
void
ifnet_decr_iorefcnt(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	ifnet_decr_iorefcnt_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
7998
7999 boolean_t
ifnet_datamov_begin(struct ifnet * ifp)8000 ifnet_datamov_begin(struct ifnet *ifp)
8001 {
8002 boolean_t ret;
8003
8004 lck_mtx_lock_spin(&ifp->if_ref_lock);
8005 if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8006 ifp->if_refio++;
8007 ifp->if_datamov++;
8008 }
8009 lck_mtx_unlock(&ifp->if_ref_lock);
8010
8011 DTRACE_IP2(datamov__begin, struct ifnet *, ifp, boolean_t, ret);
8012 return ret;
8013 }
8014
8015 void
ifnet_datamov_end(struct ifnet * ifp)8016 ifnet_datamov_end(struct ifnet *ifp)
8017 {
8018 lck_mtx_lock_spin(&ifp->if_ref_lock);
8019 VERIFY(ifp->if_datamov > 0);
8020 /*
8021 * if there's no more thread moving data, wakeup any
8022 * drainers that's blocked waiting for this.
8023 */
8024 if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
8025 DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
8026 DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
8027 wakeup(&(ifp->if_datamov));
8028 }
8029 ifnet_decr_iorefcnt_locked(ifp);
8030 lck_mtx_unlock(&ifp->if_ref_lock);
8031
8032 DTRACE_IP1(datamov__end, struct ifnet *, ifp);
8033 }
8034
8035 static void
ifnet_datamov_suspend_locked(struct ifnet * ifp)8036 ifnet_datamov_suspend_locked(struct ifnet *ifp)
8037 {
8038 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
8039 ifp->if_refio++;
8040 if (ifp->if_suspend++ == 0) {
8041 VERIFY(ifp->if_refflags & IFRF_READY);
8042 ifp->if_refflags &= ~IFRF_READY;
8043 }
8044 }
8045
/* Suspend data movement on the interface (see ifnet_datamov_suspend_locked). */
void
ifnet_datamov_suspend(struct ifnet *ifp)
{
	lck_mtx_lock_spin(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	ifnet_datamov_suspend_locked(ifp);
	lck_mtx_unlock(&ifp->if_ref_lock);
}
8054
8055 boolean_t
ifnet_datamov_suspend_if_needed(struct ifnet * ifp)8056 ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8057 {
8058 lck_mtx_lock_spin(&ifp->if_ref_lock);
8059 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8060 if (ifp->if_suspend > 0) {
8061 lck_mtx_unlock(&ifp->if_ref_lock);
8062 return FALSE;
8063 }
8064 ifnet_datamov_suspend_locked(ifp);
8065 lck_mtx_unlock(&ifp->if_ref_lock);
8066 return TRUE;
8067 }
8068
/*
 * Block until every in-flight data-movement section has completed,
 * then purge the transmit queues.  Data movement must already have
 * been suspended (if_suspend > 0, READY cleared) before calling.
 */
void
ifnet_datamov_drain(struct ifnet *ifp)
{
	lck_mtx_lock(&ifp->if_ref_lock);
	VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
	/* data movement must already be suspended */
	VERIFY(ifp->if_suspend > 0);
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	ifp->if_drainers++;
	/* sleep until ifnet_datamov_end() drops if_datamov to zero */
	while (ifp->if_datamov != 0) {
		DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
		    if_name(ifp));
		DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
		(void) msleep(&(ifp->if_datamov), &ifp->if_ref_lock,
		    (PZERO - 1), __func__, NULL);
		DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
	}
	VERIFY(!(ifp->if_refflags & IFRF_READY));
	VERIFY(ifp->if_drainers > 0);
	ifp->if_drainers--;
	lck_mtx_unlock(&ifp->if_ref_lock);

	/* purge the interface queues */
	if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
		if_qflush_snd(ifp, false);
	}
}
8096
/* Convenience wrapper: suspend data movement, then wait for it to drain. */
void
ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
{
	ifnet_datamov_suspend(ifp);
	ifnet_datamov_drain(ifp);
}
8103
8104 void
ifnet_datamov_resume(struct ifnet * ifp)8105 ifnet_datamov_resume(struct ifnet *ifp)
8106 {
8107 lck_mtx_lock(&ifp->if_ref_lock);
8108 /* data movement must already be suspended */
8109 VERIFY(ifp->if_suspend > 0);
8110 if (--ifp->if_suspend == 0) {
8111 VERIFY(!(ifp->if_refflags & IFRF_READY));
8112 ifp->if_refflags |= IFRF_READY;
8113 }
8114 ifnet_decr_iorefcnt_locked(ifp);
8115 lck_mtx_unlock(&ifp->if_ref_lock);
8116 }
8117
8118 static void
dlil_if_trace(struct dlil_ifnet * dl_if,int refhold)8119 dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8120 {
8121 struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8122 ctrace_t *tr;
8123 u_int32_t idx;
8124 u_int16_t *cnt;
8125
8126 if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8127 panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8128 /* NOTREACHED */
8129 }
8130
8131 if (refhold) {
8132 cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8133 tr = dl_if_dbg->dldbg_if_refhold;
8134 } else {
8135 cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8136 tr = dl_if_dbg->dldbg_if_refrele;
8137 }
8138
8139 idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8140 ctrace_record(&tr[idx]);
8141 }
8142
8143 errno_t
dlil_if_ref(struct ifnet * ifp)8144 dlil_if_ref(struct ifnet *ifp)
8145 {
8146 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8147
8148 if (dl_if == NULL) {
8149 return EINVAL;
8150 }
8151
8152 lck_mtx_lock_spin(&dl_if->dl_if_lock);
8153 ++dl_if->dl_if_refcnt;
8154 if (dl_if->dl_if_refcnt == 0) {
8155 panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
8156 /* NOTREACHED */
8157 }
8158 if (dl_if->dl_if_trace != NULL) {
8159 (*dl_if->dl_if_trace)(dl_if, TRUE);
8160 }
8161 lck_mtx_unlock(&dl_if->dl_if_lock);
8162
8163 return 0;
8164 }
8165
8166 errno_t
dlil_if_free(struct ifnet * ifp)8167 dlil_if_free(struct ifnet *ifp)
8168 {
8169 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8170 bool need_release = FALSE;
8171
8172 if (dl_if == NULL) {
8173 return EINVAL;
8174 }
8175
8176 lck_mtx_lock_spin(&dl_if->dl_if_lock);
8177 switch (dl_if->dl_if_refcnt) {
8178 case 0:
8179 panic("%s: negative refcnt for ifp=%p", __func__, ifp);
8180 /* NOTREACHED */
8181 break;
8182 case 1:
8183 if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8184 need_release = TRUE;
8185 }
8186 break;
8187 default:
8188 break;
8189 }
8190 --dl_if->dl_if_refcnt;
8191 if (dl_if->dl_if_trace != NULL) {
8192 (*dl_if->dl_if_trace)(dl_if, FALSE);
8193 }
8194 lck_mtx_unlock(&dl_if->dl_if_lock);
8195 if (need_release) {
8196 _dlil_if_release(ifp, true);
8197 }
8198 return 0;
8199 }
8200
/*
 * Attach an already-constructed if_proto to its interface: register the
 * demux descriptors with the family module, insert the protocol into
 * the interface's protocol hash, and post KEV_DL_PROTO_ATTACHED.
 *
 * Returns 0 on success; EINVAL for vmnet interfaces with a non-PF_BRIDGE
 * family; ENXIO if the interface is no longer attached; EEXIST if the
 * family is already attached; or the error from if_add_proto.
 * On success, *proto_count (if non-NULL) receives the number of
 * protocols attached to the interface.
 */
static errno_t
dlil_attach_protocol(struct if_proto *proto,
    const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
    uint32_t * proto_count)
{
	struct kev_dl_proto_data ev_pr_data;
	struct ifnet *ifp = proto->ifp;
	errno_t retval = 0;
	u_int32_t hash_value = proto_hash_value(proto->protocol_family);
	struct if_proto *prev_proto;
	struct if_proto *_proto;

	/* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
	if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
		return EINVAL;
	}

	/* take an IO refcnt so the interface cannot detach underneath us */
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		return ENXIO;
	}
	/* callee holds a proto refcnt upon success */
	ifnet_lock_exclusive(ifp);
	_proto = find_attached_proto(ifp, proto->protocol_family);
	if (_proto != NULL) {
		/* family already attached; drop the lookup reference */
		ifnet_lock_done(ifp);
		if_proto_free(_proto);
		retval = EEXIST;
		goto ioref_done;
	}

	/*
	 * Call family module add_proto routine so it can refine the
	 * demux descriptors as it wishes.
	 */
	retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
	    demux_count);
	if (retval) {
		ifnet_lock_done(ifp);
		goto ioref_done;
	}

	/*
	 * Insert the protocol in the hash; walk to the end of the chain
	 * so the new entry is appended after existing ones.
	 */
	prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
	while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
		prev_proto = SLIST_NEXT(prev_proto, next_hash);
	}
	if (prev_proto) {
		SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
	} else {
		SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
		    proto, next_hash);
	}

	/* hold a proto refcnt for attach */
	if_proto_ref(proto);

	/*
	 * The reserved field carries the number of protocol still attached
	 * (subject to change)
	 */
	ev_pr_data.proto_family = proto->protocol_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);
	if (proto_count != NULL) {
		*proto_count = ev_pr_data.proto_remaining_count;
	}
ioref_done:
	/* release the IO refcnt taken by ifnet_is_attached() above */
	ifnet_decr_iorefcnt(ifp);
	return retval;
}
8280
/*
 * Post-attach housekeeping after a protocol is successfully attached:
 * bring the interface up and, on SKYWALK kernels, attach the flowswitch
 * nexus when the protocol is IPv4 or IPv6.
 */
static void
dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
{
	/*
	 * A protocol has been attached, mark the interface up.
	 * This used to be done by configd.KernelEventMonitor, but that
	 * is inherently prone to races (rdar://problem/30810208).
	 */
	(void) ifnet_set_flags(ifp, IFF_UP, IFF_UP);
	(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
	dlil_post_sifflags_msg(ifp);
#if SKYWALK
	switch (protocol) {
	case AF_INET:
	case AF_INET6:
		/* don't attach the flowswitch unless attaching IP */
		dlil_attach_flowswitch_nexus(ifp);
		break;
	default:
		break;
	}
#endif /* SKYWALK */
}
8304
/*
 * Attach a protocol with version-1 KPI callbacks to an interface.
 *
 * Validates arguments, confirms ifp is on the global ifnet list,
 * allocates and fills an if_proto from proto_details, and hands it to
 * dlil_attach_protocol().  On success, runs dlil_handle_proto_attach();
 * on failure, the if_proto is freed here.
 *
 * Returns 0 on success, EINVAL on bad arguments, ENXIO if the interface
 * is not attached, EEXIST if the family is already attached, or the
 * error propagated from dlil_attach_protocol().
 */
errno_t
ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v1;
	/* copy the caller-supplied v1 callback set */
	ifproto->kpi.v1.input = proto_details->input;
	ifproto->kpi.v1.pre_output = proto_details->pre_output;
	ifproto->kpi.v1.event = proto_details->event;
	ifproto->kpi.v1.ioctl = proto_details->ioctl;
	ifproto->kpi.v1.detached = proto_details->detached;
	ifproto->kpi.v1.resolve_multi = proto_details->resolve;
	ifproto->kpi.v1.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8366
/*
 * Attach a protocol with version-2 KPI callbacks to an interface.
 * Mirrors ifnet_attach_protocol() but fills the kpi.v2 callback set
 * from an ifnet_attach_proto_param_v2.
 *
 * Returns 0 on success, EINVAL on bad arguments, ENXIO if the interface
 * is not attached, EEXIST if the family is already attached, or the
 * error propagated from dlil_attach_protocol().
 */
errno_t
ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
    const struct ifnet_attach_proto_param_v2 *proto_details)
{
	int retval = 0;
	struct if_proto *ifproto = NULL;
	uint32_t proto_count = 0;

	ifnet_head_lock_shared();
	if (ifp == NULL || protocol == 0 || proto_details == NULL) {
		retval = EINVAL;
		goto end;
	}
	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto end;
	}

	ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	ifproto->ifp = ifp;
	ifproto->protocol_family = protocol;
	ifproto->proto_kpi = kProtoKPI_v2;
	/* copy the caller-supplied v2 callback set */
	ifproto->kpi.v2.input = proto_details->input;
	ifproto->kpi.v2.pre_output = proto_details->pre_output;
	ifproto->kpi.v2.event = proto_details->event;
	ifproto->kpi.v2.ioctl = proto_details->ioctl;
	ifproto->kpi.v2.detached = proto_details->detached;
	ifproto->kpi.v2.resolve_multi = proto_details->resolve;
	ifproto->kpi.v2.send_arp = proto_details->send_arp;

	retval = dlil_attach_protocol(ifproto,
	    proto_details->demux_list, proto_details->demux_count,
	    &proto_count);

end:
	if (retval == EEXIST) {
		/* already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: protocol %d already attached\n",
			    ifp != NULL ? if_name(ifp) : "N/A",
			    protocol);
		}
	} else if (retval != 0) {
		DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
	} else if (dlil_verbose) {
		DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
		    ifp != NULL ? if_name(ifp) : "N/A",
		    protocol, proto_count);
	}
	ifnet_head_done();
	if (retval == 0) {
		dlil_handle_proto_attach(ifp, protocol);
	} else if (ifproto != NULL) {
		zfree(dlif_proto_zone, ifproto);
	}
	return retval;
}
8428
/*
 * Detach the protocol with family proto_family from an interface:
 * notify the family module, remove the protocol from the hash table,
 * swap stub callbacks in for racing callers, and drop the references
 * held since attach.  Final teardown happens when the last proto
 * reference is released.
 *
 * Returns 0 on success, EINVAL on bad arguments, or ENXIO if the
 * protocol family is not attached to the interface.
 */
errno_t
ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
{
	struct if_proto *proto = NULL;
	int retval = 0;

	if (ifp == NULL || proto_family == 0) {
		retval = EINVAL;
		goto end;
	}

	ifnet_lock_exclusive(ifp);
	/* callee holds a proto refcnt upon success */
	proto = find_attached_proto(ifp, proto_family);
	if (proto == NULL) {
		retval = ENXIO;
		ifnet_lock_done(ifp);
		goto end;
	}

	/* call family module del_proto */
	if (ifp->if_del_proto) {
		ifp->if_del_proto(ifp, proto->protocol_family);
	}

	SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
	    proto, if_proto, next_hash);

	/*
	 * Point the callbacks at the ifproto_media_* stubs (which fail
	 * with ENXIO) so in-flight users of the detached protocol see a
	 * harmless no-op rather than the old callbacks.
	 */
	if (proto->proto_kpi == kProtoKPI_v1) {
		proto->kpi.v1.input = ifproto_media_input_v1;
		proto->kpi.v1.pre_output = ifproto_media_preout;
		proto->kpi.v1.event = ifproto_media_event;
		proto->kpi.v1.ioctl = ifproto_media_ioctl;
		proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v1.send_arp = ifproto_media_send_arp;
	} else {
		proto->kpi.v2.input = ifproto_media_input_v2;
		proto->kpi.v2.pre_output = ifproto_media_preout;
		proto->kpi.v2.event = ifproto_media_event;
		proto->kpi.v2.ioctl = ifproto_media_ioctl;
		proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
		proto->kpi.v2.send_arp = ifproto_media_send_arp;
	}
	proto->detached = 1;
	ifnet_lock_done(ifp);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
		    (proto->proto_kpi == kProtoKPI_v1) ?
		    "v1" : "v2", proto_family);
	}

	/* release proto refcnt held during protocol attach */
	if_proto_free(proto);

	/*
	 * Release proto refcnt held during lookup; the rest of
	 * protocol detach steps will happen when the last proto
	 * reference is released.
	 */
	if_proto_free(proto);

end:
	return retval;
}
8494
8495 static errno_t
ifproto_media_input_v1(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet,char * header)8496 ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8497 struct mbuf *packet, char *header)
8498 {
8499 #pragma unused(ifp, protocol, packet, header)
8500 return ENXIO;
8501 }
8502
8503 static errno_t
ifproto_media_input_v2(struct ifnet * ifp,protocol_family_t protocol,struct mbuf * packet)8504 ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8505 struct mbuf *packet)
8506 {
8507 #pragma unused(ifp, protocol, packet)
8508 return ENXIO;
8509 }
8510
8511 static errno_t
ifproto_media_preout(struct ifnet * ifp,protocol_family_t protocol,mbuf_t * packet,const struct sockaddr * dest,void * route,char * frame_type,char * link_layer_dest)8512 ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8513 mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8514 char *link_layer_dest)
8515 {
8516 #pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8517 return ENXIO;
8518 }
8519
8520 static void
ifproto_media_event(struct ifnet * ifp,protocol_family_t protocol,const struct kev_msg * event)8521 ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8522 const struct kev_msg *event)
8523 {
8524 #pragma unused(ifp, protocol, event)
8525 }
8526
8527 static errno_t
ifproto_media_ioctl(struct ifnet * ifp,protocol_family_t protocol,unsigned long command,void * argument)8528 ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8529 unsigned long command, void *argument)
8530 {
8531 #pragma unused(ifp, protocol, command, argument)
8532 return ENXIO;
8533 }
8534
8535 static errno_t
ifproto_media_resolve_multi(ifnet_t ifp,const struct sockaddr * proto_addr,struct sockaddr_dl * out_ll,size_t ll_len)8536 ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8537 struct sockaddr_dl *out_ll, size_t ll_len)
8538 {
8539 #pragma unused(ifp, proto_addr, out_ll, ll_len)
8540 return ENXIO;
8541 }
8542
8543 static errno_t
ifproto_media_send_arp(struct ifnet * ifp,u_short arpop,const struct sockaddr_dl * sender_hw,const struct sockaddr * sender_proto,const struct sockaddr_dl * target_hw,const struct sockaddr * target_proto)8544 ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8545 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8546 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8547 {
8548 #pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8549 return ENXIO;
8550 }
8551
8552 extern int if_next_index(void);
8553 extern int tcp_ecn_outbound;
8554
8555 void
dlil_ifclassq_setup(struct ifnet * ifp,struct ifclassq * ifcq)8556 dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8557 {
8558 uint32_t sflags = 0;
8559 int err;
8560
8561 if (if_flowadv) {
8562 sflags |= PKTSCHEDF_QALG_FLOWCTL;
8563 }
8564
8565 if (if_delaybased_queue) {
8566 sflags |= PKTSCHEDF_QALG_DELAYBASED;
8567 }
8568
8569 if (ifp->if_output_sched_model ==
8570 IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8571 sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8572 }
8573 /* Inherit drop limit from the default queue */
8574 if (ifp->if_snd != ifcq) {
8575 IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8576 }
8577 /* Initialize transmit queue(s) */
8578 err = ifclassq_setup(ifcq, ifp, sflags);
8579 if (err != 0) {
8580 panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8581 "err=%d", __func__, ifp, err);
8582 /* NOTREACHED */
8583 }
8584 }
8585
8586 errno_t
ifnet_attach(ifnet_t ifp,const struct sockaddr_dl * ll_addr)8587 ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
8588 {
8589 #if SKYWALK
8590 boolean_t netif_compat;
8591 if_nexus_netif nexus_netif;
8592 #endif /* SKYWALK */
8593 struct ifnet *tmp_if;
8594 struct ifaddr *ifa;
8595 struct if_data_internal if_data_saved;
8596 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8597 struct dlil_threading_info *dl_inp;
8598 thread_continue_t thfunc = NULL;
8599 int err;
8600
8601 if (ifp == NULL) {
8602 return EINVAL;
8603 }
8604
8605 /*
8606 * Serialize ifnet attach using dlil_ifnet_lock, in order to
8607 * prevent the interface from being configured while it is
8608 * embryonic, as ifnet_head_lock is dropped and reacquired
8609 * below prior to marking the ifnet with IFRF_ATTACHED.
8610 */
8611 dlil_if_lock();
8612 ifnet_head_lock_exclusive();
8613 /* Verify we aren't already on the list */
8614 TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
8615 if (tmp_if == ifp) {
8616 ifnet_head_done();
8617 dlil_if_unlock();
8618 return EEXIST;
8619 }
8620 }
8621
8622 lck_mtx_lock_spin(&ifp->if_ref_lock);
8623 if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
8624 panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
8625 __func__, ifp);
8626 /* NOTREACHED */
8627 }
8628 lck_mtx_unlock(&ifp->if_ref_lock);
8629
8630 ifnet_lock_exclusive(ifp);
8631
8632 /* Sanity check */
8633 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
8634 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
8635 VERIFY(ifp->if_threads_pending == 0);
8636
8637 if (ll_addr != NULL) {
8638 if (ifp->if_addrlen == 0) {
8639 ifp->if_addrlen = ll_addr->sdl_alen;
8640 } else if (ll_addr->sdl_alen != ifp->if_addrlen) {
8641 ifnet_lock_done(ifp);
8642 ifnet_head_done();
8643 dlil_if_unlock();
8644 return EINVAL;
8645 }
8646 }
8647
8648 /*
8649 * Allow interfaces without protocol families to attach
8650 * only if they have the necessary fields filled out.
8651 */
8652 if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
8653 DLIL_PRINTF("%s: Attempt to attach interface without "
8654 "family module - %d\n", __func__, ifp->if_family);
8655 ifnet_lock_done(ifp);
8656 ifnet_head_done();
8657 dlil_if_unlock();
8658 return ENODEV;
8659 }
8660
8661 /* Allocate protocol hash table */
8662 VERIFY(ifp->if_proto_hash == NULL);
8663 ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
8664 PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8665
8666 lck_mtx_lock_spin(&ifp->if_flt_lock);
8667 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
8668 TAILQ_INIT(&ifp->if_flt_head);
8669 VERIFY(ifp->if_flt_busy == 0);
8670 VERIFY(ifp->if_flt_waiters == 0);
8671 VERIFY(ifp->if_flt_non_os_count == 0);
8672 VERIFY(ifp->if_flt_no_tso_count == 0);
8673 lck_mtx_unlock(&ifp->if_flt_lock);
8674
8675 if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
8676 VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
8677 LIST_INIT(&ifp->if_multiaddrs);
8678 }
8679
8680 VERIFY(ifp->if_allhostsinm == NULL);
8681 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
8682 TAILQ_INIT(&ifp->if_addrhead);
8683
8684 if (ifp->if_index == 0) {
8685 int idx = if_next_index();
8686
8687 /*
8688 * Since we exhausted the list of
8689 * if_index's, try to find an empty slot
8690 * in ifindex2ifnet.
8691 */
8692 if (idx == -1 && if_index >= UINT16_MAX) {
8693 for (int i = 1; i < if_index; i++) {
8694 if (ifindex2ifnet[i] == NULL &&
8695 ifnet_addrs[i - 1] == NULL) {
8696 idx = i;
8697 break;
8698 }
8699 }
8700 }
8701 if (idx == -1) {
8702 ifp->if_index = 0;
8703 ifnet_lock_done(ifp);
8704 ifnet_head_done();
8705 dlil_if_unlock();
8706 return ENOBUFS;
8707 }
8708 ifp->if_index = (uint16_t)idx;
8709
8710 /* the lladdr passed at attach time is the permanent address */
8711 if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
8712 ll_addr->sdl_alen == ETHER_ADDR_LEN) {
8713 bcopy(CONST_LLADDR(ll_addr),
8714 dl_if->dl_if_permanent_ether,
8715 ETHER_ADDR_LEN);
8716 dl_if->dl_if_permanent_ether_is_set = 1;
8717 }
8718 }
8719 /* There should not be anything occupying this slot */
8720 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
8721
8722 /* allocate (if needed) and initialize a link address */
8723 ifa = dlil_alloc_lladdr(ifp, ll_addr);
8724 if (ifa == NULL) {
8725 ifnet_lock_done(ifp);
8726 ifnet_head_done();
8727 dlil_if_unlock();
8728 return ENOBUFS;
8729 }
8730
8731 VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
8732 ifnet_addrs[ifp->if_index - 1] = ifa;
8733
8734 /* make this address the first on the list */
8735 IFA_LOCK(ifa);
8736 /* hold a reference for ifnet_addrs[] */
8737 ifa_addref(ifa);
8738 /* if_attach_link_ifa() holds a reference for ifa_link */
8739 if_attach_link_ifa(ifp, ifa);
8740 IFA_UNLOCK(ifa);
8741
8742 TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
8743 ifindex2ifnet[ifp->if_index] = ifp;
8744
8745 /* Hold a reference to the underlying dlil_ifnet */
8746 ifnet_reference(ifp);
8747
8748 /* Clear stats (save and restore other fields that we care) */
8749 if_data_saved = ifp->if_data;
8750 bzero(&ifp->if_data, sizeof(ifp->if_data));
8751 ifp->if_data.ifi_type = if_data_saved.ifi_type;
8752 ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
8753 ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
8754 ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
8755 ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
8756 ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
8757 ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
8758 ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
8759 ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
8760 ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
8761 ifnet_touch_lastchange(ifp);
8762
8763 VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
8764 ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
8765 ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);
8766
8767 dlil_ifclassq_setup(ifp, ifp->if_snd);
8768
8769 /* Sanity checks on the input thread storage */
8770 dl_inp = &dl_if->dl_if_inpstorage;
8771 bzero(&dl_inp->dlth_stats, sizeof(dl_inp->dlth_stats));
8772 VERIFY(dl_inp->dlth_flags == 0);
8773 VERIFY(dl_inp->dlth_wtot == 0);
8774 VERIFY(dl_inp->dlth_ifp == NULL);
8775 VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
8776 VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
8777 VERIFY(!dl_inp->dlth_affinity);
8778 VERIFY(ifp->if_inp == NULL);
8779 VERIFY(dl_inp->dlth_thread == THREAD_NULL);
8780 VERIFY(dl_inp->dlth_strategy == NULL);
8781 VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
8782 VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
8783 VERIFY(dl_inp->dlth_affinity_tag == 0);
8784
8785 #if IFNET_INPUT_SANITY_CHK
8786 VERIFY(dl_inp->dlth_pkts_cnt == 0);
8787 #endif /* IFNET_INPUT_SANITY_CHK */
8788
8789 VERIFY(ifp->if_poll_thread == THREAD_NULL);
8790 dlil_reset_rxpoll_params(ifp);
8791 /*
8792 * A specific DLIL input thread is created per non-loopback interface.
8793 */
8794 if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
8795 ifp->if_inp = dl_inp;
8796 ifnet_incr_pending_thread_count(ifp);
8797 err = dlil_create_input_thread(ifp, ifp->if_inp, &thfunc);
8798 if (err == ENODEV) {
8799 VERIFY(thfunc == NULL);
8800 ifnet_decr_pending_thread_count(ifp);
8801 } else if (err != 0) {
8802 panic_plain("%s: ifp=%p couldn't get an input thread; "
8803 "err=%d", __func__, ifp, err);
8804 /* NOTREACHED */
8805 }
8806 }
8807 /*
8808 * If the driver supports the new transmit model, calculate flow hash
8809 * and create a workloop starter thread to invoke the if_start callback
8810 * where the packets may be dequeued and transmitted.
8811 */
8812 if (ifp->if_eflags & IFEF_TXSTART) {
8813 thread_precedence_policy_data_t info;
8814 __unused kern_return_t kret;
8815
8816 ifp->if_flowhash = ifnet_calc_flowhash(ifp);
8817 VERIFY(ifp->if_flowhash != 0);
8818 VERIFY(ifp->if_start_thread == THREAD_NULL);
8819
8820 ifnet_set_start_cycle(ifp, NULL);
8821 ifp->if_start_active = 0;
8822 ifp->if_start_req = 0;
8823 ifp->if_start_flags = 0;
8824 VERIFY(ifp->if_start != NULL);
8825 ifnet_incr_pending_thread_count(ifp);
8826 if ((err = kernel_thread_start(ifnet_start_thread_func,
8827 ifp, &ifp->if_start_thread)) != KERN_SUCCESS) {
8828 panic_plain("%s: "
8829 "ifp=%p couldn't get a start thread; "
8830 "err=%d", __func__, ifp, err);
8831 /* NOTREACHED */
8832 }
8833 bzero(&info, sizeof(info));
8834 info.importance = 1;
8835 kret = thread_policy_set(ifp->if_start_thread,
8836 THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8837 THREAD_PRECEDENCE_POLICY_COUNT);
8838 ASSERT(kret == KERN_SUCCESS);
8839 } else {
8840 ifp->if_flowhash = 0;
8841 }
8842
8843 /* Reset polling parameters */
8844 ifnet_set_poll_cycle(ifp, NULL);
8845 ifp->if_poll_update = 0;
8846 ifp->if_poll_flags = 0;
8847 ifp->if_poll_req = 0;
8848 VERIFY(ifp->if_poll_thread == THREAD_NULL);
8849
8850 /*
8851 * If the driver supports the new receive model, create a poller
8852 * thread to invoke if_input_poll callback where the packets may
8853 * be dequeued from the driver and processed for reception.
8854 * if the interface is netif compat then the poller thread is
8855 * managed by netif.
8856 */
8857 if (thfunc == dlil_rxpoll_input_thread_func) {
8858 thread_precedence_policy_data_t info;
8859 __unused kern_return_t kret;
8860 #if SKYWALK
8861 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
8862 #endif /* SKYWALK */
8863 VERIFY(ifp->if_input_poll != NULL);
8864 VERIFY(ifp->if_input_ctl != NULL);
8865 ifnet_incr_pending_thread_count(ifp);
8866 if ((err = kernel_thread_start(ifnet_poll_thread_func, ifp,
8867 &ifp->if_poll_thread)) != KERN_SUCCESS) {
8868 panic_plain("%s: ifp=%p couldn't get a poll thread; "
8869 "err=%d", __func__, ifp, err);
8870 /* NOTREACHED */
8871 }
8872 bzero(&info, sizeof(info));
8873 info.importance = 1;
8874 kret = thread_policy_set(ifp->if_poll_thread,
8875 THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
8876 THREAD_PRECEDENCE_POLICY_COUNT);
8877 ASSERT(kret == KERN_SUCCESS);
8878 }
8879
8880 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
8881 VERIFY(ifp->if_desc.ifd_len == 0);
8882 VERIFY(ifp->if_desc.ifd_desc != NULL);
8883
8884 /* Record attach PC stacktrace */
8885 ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
8886
8887 ifp->if_updatemcasts = 0;
8888 if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
8889 struct ifmultiaddr *ifma;
8890 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
8891 IFMA_LOCK(ifma);
8892 if (ifma->ifma_addr->sa_family == AF_LINK ||
8893 ifma->ifma_addr->sa_family == AF_UNSPEC) {
8894 ifp->if_updatemcasts++;
8895 }
8896 IFMA_UNLOCK(ifma);
8897 }
8898
8899 DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
8900 "membership(s)\n", if_name(ifp),
8901 ifp->if_updatemcasts);
8902 }
8903
8904 /* Clear logging parameters */
8905 bzero(&ifp->if_log, sizeof(ifp->if_log));
8906
8907 /* Clear foreground/realtime activity timestamps */
8908 ifp->if_fg_sendts = 0;
8909 ifp->if_rt_sendts = 0;
8910
8911 /* Clear throughput estimates and radio type */
8912 ifp->if_estimated_up_bucket = 0;
8913 ifp->if_estimated_down_bucket = 0;
8914 ifp->if_radio_type = 0;
8915 ifp->if_radio_channel = 0;
8916
8917 VERIFY(ifp->if_delegated.ifp == NULL);
8918 VERIFY(ifp->if_delegated.type == 0);
8919 VERIFY(ifp->if_delegated.family == 0);
8920 VERIFY(ifp->if_delegated.subfamily == 0);
8921 VERIFY(ifp->if_delegated.expensive == 0);
8922 VERIFY(ifp->if_delegated.constrained == 0);
8923 VERIFY(ifp->if_delegated.ultra_constrained == 0);
8924
8925 VERIFY(ifp->if_agentids == NULL);
8926 VERIFY(ifp->if_agentcount == 0);
8927
8928 /* Reset interface state */
8929 bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));
8930 ifp->if_interface_state.valid_bitmask |=
8931 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
8932 ifp->if_interface_state.interface_availability =
8933 IF_INTERFACE_STATE_INTERFACE_AVAILABLE;
8934
8935 /* Initialize Link Quality Metric (loopback [lo0] is always good) */
8936 if (ifp == lo_ifp) {
8937 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
8938 ifp->if_interface_state.valid_bitmask |=
8939 IF_INTERFACE_STATE_LQM_STATE_VALID;
8940 } else {
8941 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
8942 }
8943
8944 /*
8945 * Enable ECN capability on this interface depending on the
8946 * value of ECN global setting
8947 */
8948 if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
8949 if_set_eflags(ifp, IFEF_ECN_ENABLE);
8950 if_clear_eflags(ifp, IFEF_ECN_DISABLE);
8951 }
8952
8953 /*
8954 * Built-in Cyclops always on policy for WiFi infra
8955 */
8956 if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
8957 errno_t error;
8958
8959 error = if_set_qosmarking_mode(ifp,
8960 IFRTYPE_QOSMARKING_FASTLANE);
8961 if (error != 0) {
8962 DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
8963 __func__, ifp->if_xname, error);
8964 } else {
8965 if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
8966 #if (DEVELOPMENT || DEBUG)
8967 DLIL_PRINTF("%s fastlane enabled on %s\n",
8968 __func__, ifp->if_xname);
8969 #endif /* (DEVELOPMENT || DEBUG) */
8970 }
8971 }
8972
8973 ifnet_lock_done(ifp);
8974 ifnet_head_done();
8975
8976 #if SKYWALK
8977 netif_compat = dlil_attach_netif_compat_nexus(ifp, &nexus_netif);
8978 #endif /* SKYWALK */
8979
8980 lck_mtx_lock(&ifp->if_cached_route_lock);
8981 /* Enable forwarding cached route */
8982 ifp->if_fwd_cacheok = 1;
8983 /* Clean up any existing cached routes */
8984 ROUTE_RELEASE(&ifp->if_fwd_route);
8985 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
8986 ROUTE_RELEASE(&ifp->if_src_route);
8987 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
8988 ROUTE_RELEASE(&ifp->if_src_route6);
8989 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
8990 lck_mtx_unlock(&ifp->if_cached_route_lock);
8991
8992 ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
8993
8994 /*
8995 * Allocate and attach IGMPv3/MLDv2 interface specific variables
8996 * and trees; do this before the ifnet is marked as attached.
8997 * The ifnet keeps the reference to the info structures even after
8998 * the ifnet is detached, since the network-layer records still
8999 * refer to the info structures even after that. This also
9000 * makes it possible for them to still function after the ifnet
9001 * is recycled or reattached.
9002 */
9003 #if INET
9004 if (IGMP_IFINFO(ifp) == NULL) {
9005 IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
9006 VERIFY(IGMP_IFINFO(ifp) != NULL);
9007 } else {
9008 VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
9009 igmp_domifreattach(IGMP_IFINFO(ifp));
9010 }
9011 #endif /* INET */
9012 if (MLD_IFINFO(ifp) == NULL) {
9013 MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
9014 VERIFY(MLD_IFINFO(ifp) != NULL);
9015 } else {
9016 VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
9017 mld_domifreattach(MLD_IFINFO(ifp));
9018 }
9019
9020 VERIFY(ifp->if_data_threshold == 0);
9021 VERIFY(ifp->if_dt_tcall != NULL);
9022
9023 /*
9024 * Wait for the created kernel threads for I/O to get
9025 * scheduled and run at least once before we proceed
9026 * to mark interface as attached.
9027 */
9028 lck_mtx_lock(&ifp->if_ref_lock);
9029 while (ifp->if_threads_pending != 0) {
9030 DLIL_PRINTF("%s: Waiting for all kernel threads created for "
9031 "interface %s to get scheduled at least once.\n",
9032 __func__, ifp->if_xname);
9033 (void) msleep(&ifp->if_threads_pending, &ifp->if_ref_lock, (PZERO - 1),
9034 __func__, NULL);
9035 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
9036 }
9037 lck_mtx_unlock(&ifp->if_ref_lock);
9038 DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
9039 "at least once. Proceeding.\n", __func__, ifp->if_xname);
9040
9041 /* Final mark this ifnet as attached. */
9042 ifnet_lock_exclusive(ifp);
9043 lck_mtx_lock_spin(&ifp->if_ref_lock);
9044 ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
9045 lck_mtx_unlock(&ifp->if_ref_lock);
9046 if (net_rtref) {
9047 /* boot-args override; enable idle notification */
9048 (void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
9049 IFRF_IDLE_NOTIFY);
9050 } else {
9051 /* apply previous request(s) to set the idle flags, if any */
9052 (void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
9053 ifp->if_idle_new_flags_mask);
9054 }
9055 #if SKYWALK
9056 /* the interface is fully attached; let the nexus adapter know */
9057 if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
9058 if (netif_compat) {
9059 if (sk_netif_compat_txmodel ==
9060 NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
9061 ifnet_enqueue_multi_setup(ifp,
9062 sk_tx_delay_qlen, sk_tx_delay_timeout);
9063 }
9064 ifp->if_nx_netif = nexus_netif;
9065 }
9066 ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
9067 }
9068 #endif /* SKYWALK */
9069 ifnet_lock_done(ifp);
9070 dlil_if_unlock();
9071
9072 #if PF
9073 /*
9074 * Attach packet filter to this interface, if enabled.
9075 */
9076 pf_ifnet_hook(ifp, 1);
9077 #endif /* PF */
9078
9079 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0, FALSE);
9080
9081 if (dlil_verbose) {
9082 DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
9083 (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
9084 }
9085
9086 return 0;
9087 }
9088
9089 /*
 * Prepare the storage for the first/permanent link address, which must
 * have the same lifetime as the ifnet itself. Although the link
9092 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9093 * its location in memory must never change as it may still be referred
9094 * to by some parts of the system afterwards (unfortunate implementation
9095 * artifacts inherited from BSD.)
9096 *
9097 * Caller must hold ifnet lock as writer.
9098 */
static struct ifaddr *
dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
{
	struct ifaddr *ifa, *oifa = NULL;
	struct sockaddr_dl *addr_sdl, *mask_sdl;
	char workbuf[IFNAMSIZ * 2];
	int namelen, masklen, socksize;
	struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);

	/*
	 * Size the sockaddr_dl structures: the mask spans the fixed
	 * header plus the interface name; the address additionally
	 * carries if_addrlen bytes of link-layer address.
	 */
	namelen = scnprintf(workbuf, sizeof(workbuf), "%s",
	    if_name(ifp));
	masklen = offsetof(struct sockaddr_dl, sdl_data[0])
	    + ((namelen > 0) ? namelen : 0);
	socksize = masklen + ifp->if_addrlen;
	/* round up to a u_int32_t boundary, floor of sizeof(sockaddr_dl) */
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
	if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
		socksize = sizeof(struct sockaddr_dl);
	}
	socksize = ROUNDUP(socksize);
#undef ROUNDUP

	ifa = ifp->if_lladdr;
	if (socksize > DLIL_SDLMAXLEN ||
	    (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
		/*
		 * Rare, but in the event that the link address requires
		 * more storage space than DLIL_SDLMAXLEN, allocate the
		 * largest possible storages for address and mask, such
		 * that we can reuse the same space when if_addrlen grows.
		 * This same space will be used when if_addrlen shrinks.
		 */
		struct dl_if_lladdr_xtra_space *__single dl_if_lladdr_ext;

		if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
			/* first time here: allocate the oversized storage */
			dl_if_lladdr_ext = zalloc_permanent(
				sizeof(*dl_if_lladdr_ext), ZALIGN(struct ifaddr));

			ifa = &dl_if_lladdr_ext->ifa;
			ifa_lock_init(ifa);
			ifa_initref(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		} else {
			/* reuse the oversized storage allocated previously */
			dl_if_lladdr_ext = __unsafe_forge_single(
				struct dl_if_lladdr_xtra_space*, ifa);
			ifa = &dl_if_lladdr_ext->ifa;
		}

		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		bzero(dl_if_lladdr_ext->addr_sdl_bytes,
		    sizeof(dl_if_lladdr_ext->addr_sdl_bytes));
		bzero(dl_if_lladdr_ext->mask_sdl_bytes,
		    sizeof(dl_if_lladdr_ext->mask_sdl_bytes));
		addr_sdl = SDL(dl_if_lladdr_ext->addr_sdl_bytes);
		mask_sdl = SDL(dl_if_lladdr_ext->mask_sdl_bytes);
	} else {
		VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
		/*
		 * Use the storage areas for address and mask within the
		 * dlil_ifnet structure. This is the most common case.
		 */
		if (ifa == NULL) {
			/* first time here: initialize the embedded ifaddr */
			ifa = &dl_if->dl_if_lladdr.ifa;
			ifa_lock_init(ifa);
			ifa_initref(ifa);
			/* Don't set IFD_ALLOC, as this is permanent */
			ifa->ifa_debug = IFD_LINK;
		}
		IFA_LOCK(ifa);
		/* address and mask sockaddr_dl locations */
		bzero(dl_if->dl_if_lladdr.addr_sdl_bytes,
		    sizeof(dl_if->dl_if_lladdr.addr_sdl_bytes));
		bzero(dl_if->dl_if_lladdr.mask_sdl_bytes,
		    sizeof(dl_if->dl_if_lladdr.mask_sdl_bytes));
		addr_sdl = SDL(dl_if->dl_if_lladdr.addr_sdl_bytes);
		mask_sdl = SDL(dl_if->dl_if_lladdr.mask_sdl_bytes);
	}

	/* publish the (possibly different) ifaddr; old one released below */
	if (ifp->if_lladdr != ifa) {
		oifa = ifp->if_lladdr;
		ifp->if_lladdr = ifa;
	}

	/* fill in the AF_LINK address: name, index, type, lladdr bytes */
	VERIFY(ifa->ifa_debug == IFD_LINK);
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = SA(addr_sdl);
	addr_sdl->sdl_len = (u_char)socksize;
	addr_sdl->sdl_family = AF_LINK;
	if (namelen > 0) {
		bcopy(workbuf, addr_sdl->sdl_data, min(namelen,
		    sizeof(addr_sdl->sdl_data)));
		addr_sdl->sdl_nlen = (u_char)namelen;
	} else {
		addr_sdl->sdl_nlen = 0;
	}
	addr_sdl->sdl_index = ifp->if_index;
	addr_sdl->sdl_type = ifp->if_type;
	if (ll_addr != NULL) {
		addr_sdl->sdl_alen = ll_addr->sdl_alen;
		bcopy(CONST_LLADDR(ll_addr), LLADDR(addr_sdl), addr_sdl->sdl_alen);
	} else {
		addr_sdl->sdl_alen = 0;
	}
	/* the netmask covers everything up to and including the name */
	ifa->ifa_netmask = SA(mask_sdl);
	mask_sdl->sdl_len = (u_char)masklen;
	while (namelen > 0) {
		mask_sdl->sdl_data[--namelen] = 0xff;
	}
	IFA_UNLOCK(ifa);

	/* drop the reference held on the previous link address, if any */
	if (oifa != NULL) {
		ifa_remref(oifa);
	}

	return ifa;
}
9220
/*
 * Ask the network layers to remove every address configured on the
 * interface: IPv4 (when INET is built in) and IPv6.
 */
static void
if_purgeaddrs(struct ifnet *ifp)
{
#if INET
	in_purgeaddrs(ifp);
#endif /* INET */
	in6_purgeaddrs(ifp);
}
9229
/*
 * Begin detaching an interface: mark it down, remove it from the
 * global ifnet lists so it is no longer visible to lookups, reset
 * per-interface state, and hand the remainder of the teardown to the
 * detacher worker thread (ifnet_detach_final runs there).
 *
 * Returns 0 on success, EINVAL if ifp is NULL or was never attached,
 * ENXIO if a detach is already in progress.
 */
errno_t
ifnet_detach(ifnet_t ifp)
{
	struct ifnet *delegated_ifp;
	struct nd_ifinfo *ndi = NULL;

	if (ifp == NULL) {
		return EINVAL;
	}

	/* Invalidate IPv6 ND CGA state for this interface, if present */
	ndi = ND_IFINFO(ifp);
	if (NULL != ndi) {
		ndi->cga_initialized = FALSE;
	}

	/* Mark the interface down */
	if_down(ifp);

	/*
	 * IMPORTANT NOTE
	 *
	 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
	 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
	 * until after we've waited for all I/O references to drain
	 * in ifnet_detach_final().
	 */

	ifnet_head_lock_exclusive();
	ifnet_lock_exclusive(ifp);

	/* Tear down any network emulation (netem) state on output */
	if (ifp->if_output_netem != NULL) {
		netem_destroy(ifp->if_output_netem);
		ifp->if_output_netem = NULL;
	}

	/*
	 * Check to see if this interface has previously triggered
	 * aggressive protocol draining; if so, decrement the global
	 * refcnt and clear PR_AGGDRAIN on the route domain if
	 * there are no more of such an interface around.
	 */
	(void) ifnet_set_idle_flags_locked(ifp, 0, ~0);

	lck_mtx_lock_spin(&ifp->if_ref_lock);
	if (!(ifp->if_refflags & IFRF_ATTACHED)) {
		/* Interface is not attached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return EINVAL;
	} else if (ifp->if_refflags & IFRF_DETACHING) {
		/* Interface has already been detached */
		lck_mtx_unlock(&ifp->if_ref_lock);
		ifnet_lock_done(ifp);
		ifnet_head_done();
		return ENXIO;
	}
	VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
	/* Indicate this interface is being detached */
	ifp->if_refflags &= ~IFRF_ATTACHED;
	ifp->if_refflags |= IFRF_DETACHING;
	lck_mtx_unlock(&ifp->if_ref_lock);

	if (dlil_verbose) {
		DLIL_PRINTF("%s: detaching\n", if_name(ifp));
	}

	/* clean up flow control entry object if there's any */
	if (ifp->if_eflags & IFEF_TXSTART) {
		ifnet_flowadv(ifp->if_flowhash);
	}

	/* Reset ECN enable/disable flags */
	/* Reset CLAT46 flag */
	if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);

	/*
	 * We do not reset the TCP keep alive counters in case
	 * a TCP connection stays connected after the interface
	 * went down
	 */
	if (ifp->if_tcp_kao_cnt > 0) {
		os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
		    __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
	}
	ifp->if_tcp_kao_max = 0;

	/*
	 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
	 * no longer be visible during lookups from this point.
	 */
	VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
	TAILQ_REMOVE(&ifnet_head, ifp, if_link);
	ifp->if_link.tqe_next = NULL;
	ifp->if_link.tqe_prev = NULL;
	if (ifp->if_ordered_link.tqe_next != NULL ||
	    ifp->if_ordered_link.tqe_prev != NULL) {
		ifnet_remove_from_ordered_list(ifp);
	}
	ifindex2ifnet[ifp->if_index] = NULL;

	/* 18717626 - reset router mode */
	if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
	ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;

	/* Record detach PC stacktrace */
	ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);

	/* Clear logging parameters */
	bzero(&ifp->if_log, sizeof(ifp->if_log));

	/* Clear delegated interface info (reference released below) */
	delegated_ifp = ifp->if_delegated.ifp;
	bzero(&ifp->if_delegated, sizeof(ifp->if_delegated));

	/* Reset interface state */
	bzero(&ifp->if_interface_state, sizeof(ifp->if_interface_state));

	/*
	 * Increment the generation count on interface deletion
	 */
	ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);

	ifnet_lock_done(ifp);
	ifnet_head_done();

	/* Release reference held on the delegated interface */
	if (delegated_ifp != NULL) {
		ifnet_release(delegated_ifp);
	}

	/* Reset Link Quality Metric (unless loopback [lo0]) */
	if (ifp != lo_ifp) {
		if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
	}

	/* Reset TCP local statistics */
	if (ifp->if_tcp_stat != NULL) {
		bzero(ifp->if_tcp_stat, sizeof(*ifp->if_tcp_stat));
	}

	/* Reset UDP local statistics */
	if (ifp->if_udp_stat != NULL) {
		bzero(ifp->if_udp_stat, sizeof(*ifp->if_udp_stat));
	}

	/* Reset ifnet IPv4 stats */
	if (ifp->if_ipv4_stat != NULL) {
		bzero(ifp->if_ipv4_stat, sizeof(*ifp->if_ipv4_stat));
	}

	/* Reset ifnet IPv6 stats */
	if (ifp->if_ipv6_stat != NULL) {
		bzero(ifp->if_ipv6_stat, sizeof(*ifp->if_ipv6_stat));
	}

	/* Release memory held for interface link status report */
	if (ifp->if_link_status != NULL) {
		kfree_type(struct if_link_status, ifp->if_link_status);
		ifp->if_link_status = NULL;
	}

	/* Disable forwarding cached route */
	lck_mtx_lock(&ifp->if_cached_route_lock);
	ifp->if_fwd_cacheok = 0;
	lck_mtx_unlock(&ifp->if_cached_route_lock);

	/* Disable data threshold and wait for any pending event posting */
	ifp->if_data_threshold = 0;
	VERIFY(ifp->if_dt_tcall != NULL);
	(void) thread_call_cancel_wait(ifp->if_dt_tcall);

	/*
	 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
	 * references to the info structures and leave them attached to
	 * this ifnet.
	 */
#if INET
	igmp_domifdetach(ifp);
#endif /* INET */
	mld_domifdetach(ifp);

#if SKYWALK
	/* Clean up any netns tokens still pointing to this ifnet */
	netns_ifnet_detach(ifp);
#endif /* SKYWALK */
	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0, FALSE);

	/* Let worker thread take care of the rest, to avoid reentrancy */
	dlil_if_lock();
	ifnet_detaching_enqueue(ifp);
	dlil_if_unlock();

	return 0;
}
9424
/*
 * Queue ifp on the detaching list so the detacher thread performs the
 * final teardown (ifnet_detach_final), and wake that thread up.
 * Caller must hold the dlil_if lock.
 */
static void
ifnet_detaching_enqueue(struct ifnet *ifp)
{
	dlil_if_lock_assert();

	++ifnet_detaching_cnt;
	VERIFY(ifnet_detaching_cnt != 0);	/* guards against counter wrap */
	TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
	/* wake up the detacher thread waiting on ifnet_delayed_run */
	wakeup((caddr_t)&ifnet_delayed_run);
}
9435
9436 static struct ifnet *
ifnet_detaching_dequeue(void)9437 ifnet_detaching_dequeue(void)
9438 {
9439 struct ifnet *ifp;
9440
9441 dlil_if_lock_assert();
9442
9443 ifp = TAILQ_FIRST(&ifnet_detaching_head);
9444 VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9445 if (ifp != NULL) {
9446 VERIFY(ifnet_detaching_cnt != 0);
9447 --ifnet_detaching_cnt;
9448 TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9449 ifp->if_detaching_link.tqe_next = NULL;
9450 ifp->if_detaching_link.tqe_prev = NULL;
9451 }
9452 return ifp;
9453 }
9454
/*
 * Continuation routine for the detacher thread: drains the detaching
 * list, running ifnet_detach_final() on each interface (with the
 * dlil_if lock dropped around the call), then re-arms the wait on
 * ifnet_delayed_run with itself as the continuation.  Never returns.
 */
__attribute__((noreturn))
static void
ifnet_detacher_thread_cont(void *v, wait_result_t wres)
{
#pragma unused(v, wres)
	struct ifnet *ifp;

	dlil_if_lock();
	if (__improbable(ifnet_detaching_embryonic)) {
		/* first wakeup after thread creation; see thread_func below */
		ifnet_detaching_embryonic = FALSE;
		/* there's no lock ordering constraint so OK to do this here */
		dlil_decr_pending_thread_count();
	}

	for (;;) {
		dlil_if_lock_assert();

		if (ifnet_detaching_cnt == 0) {
			break;
		}

		net_update_uptime();

		VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);

		/* Take care of detaching ifnet */
		ifp = ifnet_detaching_dequeue();
		if (ifp != NULL) {
			/* drop the lock; final detach may block */
			dlil_if_unlock();
			ifnet_detach_final(ifp);
			dlil_if_lock();
		}
	}

	/* list drained; sleep until ifnet_detaching_enqueue() wakes us */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);

	VERIFY(0); /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}
9497
/*
 * Entry point for the detacher thread.  Arms the initial wait on
 * ifnet_delayed_run while holding the dlil_if lock, sets the
 * embryonic flag, and issues a self-wakeup so the continuation runs
 * once to clear the embryonic/pending-thread state; from then on the
 * thread lives entirely in ifnet_detacher_thread_cont().
 */
__dead2
static void
ifnet_detacher_thread_func(void *v, wait_result_t w)
{
#pragma unused(v, w)
	dlil_if_lock();
	/* assert_wait must precede the wakeup so the wakeup isn't lost */
	(void) assert_wait(&ifnet_delayed_run, THREAD_UNINT);
	ifnet_detaching_embryonic = TRUE;
	/* wake up once to get out of embryonic state */
	wakeup((caddr_t)&ifnet_delayed_run);
	dlil_if_unlock();
	(void) thread_block(ifnet_detacher_thread_cont);
	VERIFY(0);
	/* NOTREACHED */
	__builtin_unreachable();
}
9514
9515 static void
ifnet_detach_final(struct ifnet * ifp)9516 ifnet_detach_final(struct ifnet *ifp)
9517 {
9518 struct ifnet_filter *filter, *filter_next;
9519 struct dlil_ifnet *dlifp;
9520 struct ifnet_filter_head fhead;
9521 struct dlil_threading_info *inp;
9522 struct ifaddr *ifa;
9523 ifnet_detached_func if_free;
9524 int i;
9525
9526 /* Let BPF know we're detaching */
9527 bpfdetach(ifp);
9528
9529 #if SKYWALK
9530 dlil_netif_detach_notify(ifp);
9531 /*
9532 * Wait for the datapath to quiesce before tearing down
9533 * netif/flowswitch nexuses.
9534 */
9535 dlil_quiesce_and_detach_nexuses(ifp);
9536 #endif /* SKYWALK */
9537
9538 lck_mtx_lock(&ifp->if_ref_lock);
9539 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9540 panic("%s: flags mismatch (detaching not set) ifp=%p",
9541 __func__, ifp);
9542 /* NOTREACHED */
9543 }
9544
9545 /*
9546 * Wait until the existing IO references get released
9547 * before we proceed with ifnet_detach. This is not a
9548 * common case, so block without using a continuation.
9549 */
9550 while (ifp->if_refio > 0) {
9551 DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9552 "to be released\n", __func__, if_name(ifp));
9553 (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock,
9554 (PZERO - 1), "ifnet_ioref_wait", NULL);
9555 }
9556
9557 VERIFY(ifp->if_datamov == 0);
9558 VERIFY(ifp->if_drainers == 0);
9559 VERIFY(ifp->if_suspend == 0);
9560 ifp->if_refflags &= ~IFRF_READY;
9561 lck_mtx_unlock(&ifp->if_ref_lock);
9562
9563 /* Clear agent IDs */
9564 if (ifp->if_agentids != NULL) {
9565 kfree_data(ifp->if_agentids,
9566 sizeof(uuid_t) * ifp->if_agentcount);
9567 ifp->if_agentids = NULL;
9568 }
9569 ifp->if_agentcount = 0;
9570
9571 #if SKYWALK
9572 VERIFY(LIST_EMPTY(&ifp->if_netns_tokens));
9573 #endif /* SKYWALK */
9574 /* Drain and destroy send queue */
9575 ifclassq_teardown(ifp->if_snd);
9576
9577 /* Detach interface filters */
9578 lck_mtx_lock(&ifp->if_flt_lock);
9579 if_flt_monitor_enter(ifp);
9580
9581 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9582 fhead = ifp->if_flt_head;
9583 TAILQ_INIT(&ifp->if_flt_head);
9584
9585 for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9586 filter_next = TAILQ_NEXT(filter, filt_next);
9587 lck_mtx_unlock(&ifp->if_flt_lock);
9588
9589 dlil_detach_filter_internal(filter, 1);
9590 lck_mtx_lock(&ifp->if_flt_lock);
9591 }
9592 if_flt_monitor_leave(ifp);
9593 lck_mtx_unlock(&ifp->if_flt_lock);
9594
9595 /* Tell upper layers to drop their network addresses */
9596 if_purgeaddrs(ifp);
9597
9598 ifnet_lock_exclusive(ifp);
9599
9600 /* Unplumb all protocols */
9601 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9602 struct if_proto *proto;
9603
9604 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9605 while (proto != NULL) {
9606 protocol_family_t family = proto->protocol_family;
9607 ifnet_lock_done(ifp);
9608 proto_unplumb(family, ifp);
9609 ifnet_lock_exclusive(ifp);
9610 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9611 }
9612 /* There should not be any protocols left */
9613 VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9614 }
9615 kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9616 ifp->if_proto_hash = NULL;
9617
9618 /* Detach (permanent) link address from if_addrhead */
9619 ifa = TAILQ_FIRST(&ifp->if_addrhead);
9620 VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9621 IFA_LOCK(ifa);
9622 if_detach_link_ifa(ifp, ifa);
9623 IFA_UNLOCK(ifa);
9624
9625 /* Remove (permanent) link address from ifnet_addrs[] */
9626 ifa_remref(ifa);
9627 ifnet_addrs[ifp->if_index - 1] = NULL;
9628
9629 /* This interface should not be on {ifnet_head,detaching} */
9630 VERIFY(ifp->if_link.tqe_next == NULL);
9631 VERIFY(ifp->if_link.tqe_prev == NULL);
9632 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9633 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9634 VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9635 VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9636
9637 /* The slot should have been emptied */
9638 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9639
9640 /* There should not be any addresses left */
9641 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9642
9643 /*
9644 * Signal the starter thread to terminate itself, and wait until
9645 * it has exited.
9646 */
9647 if (ifp->if_start_thread != THREAD_NULL) {
9648 lck_mtx_lock_spin(&ifp->if_start_lock);
9649 ifp->if_start_flags |= IFSF_TERMINATING;
9650 wakeup_one((caddr_t)&ifp->if_start_thread);
9651 lck_mtx_unlock(&ifp->if_start_lock);
9652
9653 /* wait for starter thread to terminate */
9654 lck_mtx_lock(&ifp->if_start_lock);
9655 while (ifp->if_start_thread != THREAD_NULL) {
9656 if (dlil_verbose) {
9657 DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
9658 __func__,
9659 if_name(ifp));
9660 }
9661 (void) msleep(&ifp->if_start_thread,
9662 &ifp->if_start_lock, (PZERO - 1),
9663 "ifnet_start_thread_exit", NULL);
9664 }
9665 lck_mtx_unlock(&ifp->if_start_lock);
9666 if (dlil_verbose) {
9667 DLIL_PRINTF("%s: %s starter thread termination complete",
9668 __func__, if_name(ifp));
9669 }
9670 }
9671
9672 /*
9673 * Signal the poller thread to terminate itself, and wait until
9674 * it has exited.
9675 */
9676 if (ifp->if_poll_thread != THREAD_NULL) {
9677 #if SKYWALK
9678 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9679 #endif /* SKYWALK */
9680 lck_mtx_lock_spin(&ifp->if_poll_lock);
9681 ifp->if_poll_flags |= IF_POLLF_TERMINATING;
9682 wakeup_one((caddr_t)&ifp->if_poll_thread);
9683 lck_mtx_unlock(&ifp->if_poll_lock);
9684
9685 /* wait for poller thread to terminate */
9686 lck_mtx_lock(&ifp->if_poll_lock);
9687 while (ifp->if_poll_thread != THREAD_NULL) {
9688 if (dlil_verbose) {
9689 DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
9690 __func__,
9691 if_name(ifp));
9692 }
9693 (void) msleep(&ifp->if_poll_thread,
9694 &ifp->if_poll_lock, (PZERO - 1),
9695 "ifnet_poll_thread_exit", NULL);
9696 }
9697 lck_mtx_unlock(&ifp->if_poll_lock);
9698 if (dlil_verbose) {
9699 DLIL_PRINTF("%s: %s poller thread termination complete\n",
9700 __func__, if_name(ifp));
9701 }
9702 }
9703
9704 /*
9705 * If thread affinity was set for the workloop thread, we will need
9706 * to tear down the affinity and release the extra reference count
9707 * taken at attach time. Does not apply to lo0 or other interfaces
9708 * without dedicated input threads.
9709 */
9710 if ((inp = ifp->if_inp) != NULL) {
9711 VERIFY(inp != dlil_main_input_thread);
9712
9713 if (inp->dlth_affinity) {
9714 struct thread *tp, *wtp, *ptp;
9715
9716 lck_mtx_lock_spin(&inp->dlth_lock);
9717 wtp = inp->dlth_driver_thread;
9718 inp->dlth_driver_thread = THREAD_NULL;
9719 ptp = inp->dlth_poller_thread;
9720 inp->dlth_poller_thread = THREAD_NULL;
9721 ASSERT(inp->dlth_thread != THREAD_NULL);
9722 tp = inp->dlth_thread; /* don't nullify now */
9723 inp->dlth_affinity_tag = 0;
9724 inp->dlth_affinity = FALSE;
9725 lck_mtx_unlock(&inp->dlth_lock);
9726
9727 /* Tear down poll thread affinity */
9728 if (ptp != NULL) {
9729 VERIFY(ifp->if_eflags & IFEF_RXPOLL);
9730 VERIFY(ifp->if_xflags & IFXF_LEGACY);
9731 (void) dlil_affinity_set(ptp,
9732 THREAD_AFFINITY_TAG_NULL);
9733 thread_deallocate(ptp);
9734 }
9735
9736 /* Tear down workloop thread affinity */
9737 if (wtp != NULL) {
9738 (void) dlil_affinity_set(wtp,
9739 THREAD_AFFINITY_TAG_NULL);
9740 thread_deallocate(wtp);
9741 }
9742
9743 /* Tear down DLIL input thread affinity */
9744 (void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
9745 thread_deallocate(tp);
9746 }
9747
9748 /* disassociate ifp DLIL input thread */
9749 ifp->if_inp = NULL;
9750
9751 /* if the worker thread was created, tell it to terminate */
9752 if (inp->dlth_thread != THREAD_NULL) {
9753 lck_mtx_lock_spin(&inp->dlth_lock);
9754 inp->dlth_flags |= DLIL_INPUT_TERMINATE;
9755 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
9756 wakeup_one((caddr_t)&inp->dlth_flags);
9757 }
9758 lck_mtx_unlock(&inp->dlth_lock);
9759 ifnet_lock_done(ifp);
9760
9761 /* wait for the input thread to terminate */
9762 lck_mtx_lock_spin(&inp->dlth_lock);
9763 while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
9764 == 0) {
9765 (void) msleep(&inp->dlth_flags, &inp->dlth_lock,
9766 (PZERO - 1) | PSPIN, inp->dlth_name, NULL);
9767 }
9768 lck_mtx_unlock(&inp->dlth_lock);
9769 ifnet_lock_exclusive(ifp);
9770 }
9771
9772 /* clean-up input thread state */
9773 dlil_clean_threading_info(inp);
9774 /* clean-up poll parameters */
9775 VERIFY(ifp->if_poll_thread == THREAD_NULL);
9776 dlil_reset_rxpoll_params(ifp);
9777 }
9778
9779 /* The driver might unload, so point these to ourselves */
9780 if_free = ifp->if_free;
9781 ifp->if_output_dlil = ifp_if_output;
9782 ifp->if_output = ifp_if_output;
9783 ifp->if_pre_enqueue = ifp_if_output;
9784 ifp->if_start = ifp_if_start;
9785 ifp->if_output_ctl = ifp_if_ctl;
9786 ifp->if_input_dlil = ifp_if_input;
9787 ifp->if_input_poll = ifp_if_input_poll;
9788 ifp->if_input_ctl = ifp_if_ctl;
9789 ifp->if_ioctl = ifp_if_ioctl;
9790 ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
9791 ifp->if_free = ifp_if_free;
9792 ifp->if_demux = ifp_if_demux;
9793 ifp->if_event = ifp_if_event;
9794 ifp->if_framer_legacy = ifp_if_framer;
9795 ifp->if_framer = ifp_if_framer_extended;
9796 ifp->if_add_proto = ifp_if_add_proto;
9797 ifp->if_del_proto = ifp_if_del_proto;
9798 ifp->if_check_multi = ifp_if_check_multi;
9799
9800 /* wipe out interface description */
9801 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9802 ifp->if_desc.ifd_len = 0;
9803 VERIFY(ifp->if_desc.ifd_desc != NULL);
9804 bzero(ifp->if_desc.ifd_desc, IF_DESCSIZE);
9805
9806 /* there shouldn't be any delegation by now */
9807 VERIFY(ifp->if_delegated.ifp == NULL);
9808 VERIFY(ifp->if_delegated.type == 0);
9809 VERIFY(ifp->if_delegated.family == 0);
9810 VERIFY(ifp->if_delegated.subfamily == 0);
9811 VERIFY(ifp->if_delegated.expensive == 0);
9812 VERIFY(ifp->if_delegated.constrained == 0);
9813 VERIFY(ifp->if_delegated.ultra_constrained == 0);
9814
9815 /* QoS marking get cleared */
9816 if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9817 if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
9818
9819 #if SKYWALK
9820 /* the nexus destructor is responsible for clearing these */
9821 VERIFY(ifp->if_na_ops == NULL);
9822 VERIFY(ifp->if_na == NULL);
9823 #endif /* SKYWALK */
9824
9825 /* promiscuous/allmulti counts need to start at zero again */
9826 ifp->if_pcount = 0;
9827 ifp->if_amcount = 0;
9828 ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
9829
9830 ifnet_lock_done(ifp);
9831
9832 #if PF
9833 /*
9834 * Detach this interface from packet filter, if enabled.
9835 */
9836 pf_ifnet_hook(ifp, 0);
9837 #endif /* PF */
9838
9839 /* Filter list should be empty */
9840 lck_mtx_lock_spin(&ifp->if_flt_lock);
9841 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9842 VERIFY(ifp->if_flt_busy == 0);
9843 VERIFY(ifp->if_flt_waiters == 0);
9844 VERIFY(ifp->if_flt_non_os_count == 0);
9845 VERIFY(ifp->if_flt_no_tso_count == 0);
9846 lck_mtx_unlock(&ifp->if_flt_lock);
9847
9848 /* Last chance to drain send queue */
9849 if_qflush_snd(ifp, 0);
9850
9851 /* Last chance to cleanup any cached route */
9852 lck_mtx_lock(&ifp->if_cached_route_lock);
9853 VERIFY(!ifp->if_fwd_cacheok);
9854 ROUTE_RELEASE(&ifp->if_fwd_route);
9855 bzero(&ifp->if_fwd_route, sizeof(ifp->if_fwd_route));
9856 ROUTE_RELEASE(&ifp->if_src_route);
9857 bzero(&ifp->if_src_route, sizeof(ifp->if_src_route));
9858 ROUTE_RELEASE(&ifp->if_src_route6);
9859 bzero(&ifp->if_src_route6, sizeof(ifp->if_src_route6));
9860 lck_mtx_unlock(&ifp->if_cached_route_lock);
9861
9862 /* Ignore any pending data threshold as the interface is anyways gone */
9863 ifp->if_data_threshold = 0;
9864
9865 VERIFY(ifp->if_dt_tcall != NULL);
9866 VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
9867
9868 ifnet_llreach_ifdetach(ifp);
9869
9870 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0, FALSE);
9871
9872 /*
9873 * Finally, mark this ifnet as detached.
9874 */
9875 if (dlil_verbose) {
9876 DLIL_PRINTF("%s: detached\n", if_name(ifp));
9877 }
9878 lck_mtx_lock_spin(&ifp->if_ref_lock);
9879 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9880 panic("%s: flags mismatch (detaching not set) ifp=%p",
9881 __func__, ifp);
9882 /* NOTREACHED */
9883 }
9884 ifp->if_refflags &= ~IFRF_DETACHING;
9885 lck_mtx_unlock(&ifp->if_ref_lock);
9886 if (if_free != NULL) {
9887 if_free(ifp);
9888 }
9889
9890 ifclassq_release(&ifp->if_snd);
9891
9892 /* we're fully detached, clear the "in use" bit */
9893 dlifp = (struct dlil_ifnet *)ifp;
9894 lck_mtx_lock(&dlifp->dl_if_lock);
9895 ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
9896 dlifp->dl_if_flags &= ~DLIF_INUSE;
9897 lck_mtx_unlock(&dlifp->dl_if_lock);
9898
9899 /* Release reference held during ifnet attach */
9900 ifnet_release(ifp);
9901 }
9902
9903 errno_t
ifp_if_output(struct ifnet * ifp,struct mbuf * m)9904 ifp_if_output(struct ifnet *ifp, struct mbuf *m)
9905 {
9906 #pragma unused(ifp)
9907 m_freem_list(m);
9908 return 0;
9909 }
9910
/*
 * Start handler installed on a detached interface: flush whatever is
 * still sitting in the send queue.
 */
void
ifp_if_start(struct ifnet *ifp)
{
	ifnet_purge(ifp);
}
9916
9917 static errno_t
ifp_if_input(struct ifnet * ifp,struct mbuf * m_head,struct mbuf * m_tail,const struct ifnet_stat_increment_param * s,boolean_t poll,struct thread * tp)9918 ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
9919 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
9920 boolean_t poll, struct thread *tp)
9921 {
9922 #pragma unused(ifp, m_tail, s, poll, tp)
9923 m_freem_list(m_head);
9924 return ENXIO;
9925 }
9926
9927 static void
ifp_if_input_poll(struct ifnet * ifp,u_int32_t flags,u_int32_t max_cnt,struct mbuf ** m_head,struct mbuf ** m_tail,u_int32_t * cnt,u_int32_t * len)9928 ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
9929 struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
9930 {
9931 #pragma unused(ifp, flags, max_cnt)
9932 if (m_head != NULL) {
9933 *m_head = NULL;
9934 }
9935 if (m_tail != NULL) {
9936 *m_tail = NULL;
9937 }
9938 if (cnt != NULL) {
9939 *cnt = 0;
9940 }
9941 if (len != NULL) {
9942 *len = 0;
9943 }
9944 }
9945
9946 static errno_t
ifp_if_ctl(struct ifnet * ifp,ifnet_ctl_cmd_t cmd,u_int32_t arglen,void * arg)9947 ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
9948 {
9949 #pragma unused(ifp, cmd, arglen, arg)
9950 return EOPNOTSUPP;
9951 }
9952
9953 static errno_t
ifp_if_demux(struct ifnet * ifp,struct mbuf * m,char * fh,protocol_family_t * pf)9954 ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
9955 {
9956 #pragma unused(ifp, fh, pf)
9957 m_freem(m);
9958 return EJUSTRETURN;
9959 }
9960
9961 static errno_t
ifp_if_add_proto(struct ifnet * ifp,protocol_family_t pf,const struct ifnet_demux_desc * da,u_int32_t dc)9962 ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
9963 const struct ifnet_demux_desc *da, u_int32_t dc)
9964 {
9965 #pragma unused(ifp, pf, da, dc)
9966 return EINVAL;
9967 }
9968
9969 static errno_t
ifp_if_del_proto(struct ifnet * ifp,protocol_family_t pf)9970 ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
9971 {
9972 #pragma unused(ifp, pf)
9973 return EINVAL;
9974 }
9975
9976 static errno_t
ifp_if_check_multi(struct ifnet * ifp,const struct sockaddr * sa)9977 ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
9978 {
9979 #pragma unused(ifp, sa)
9980 return EOPNOTSUPP;
9981 }
9982
/*
 * Legacy framer installed on a detached interface.  The signature differs
 * by platform (the embedded variant carries pre/post length pointers);
 * both forward to ifp_if_framer_extended(), which drops the packet.
 */
#if !XNU_TARGET_OS_OSX
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t,
    u_int32_t *pre, u_int32_t *post)
#else /* XNU_TARGET_OS_OSX */
static errno_t
ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
    const struct sockaddr *sa, const char *ll, const char *t)
#endif /* XNU_TARGET_OS_OSX */
{
#pragma unused(ifp, m, sa, ll, t)
#if !XNU_TARGET_OS_OSX
	return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
#else /* XNU_TARGET_OS_OSX */
	/* macOS legacy signature has no pre/post out-params */
	return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
#endif /* XNU_TARGET_OS_OSX */
}
10001
10002 static errno_t
ifp_if_framer_extended(struct ifnet * ifp,struct mbuf ** m,const struct sockaddr * sa,const char * ll,const char * t,u_int32_t * pre,u_int32_t * post)10003 ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10004 const struct sockaddr *sa, const char *ll, const char *t,
10005 u_int32_t *pre, u_int32_t *post)
10006 {
10007 #pragma unused(ifp, sa, ll, t)
10008 m_freem(*m);
10009 *m = NULL;
10010
10011 if (pre != NULL) {
10012 *pre = 0;
10013 }
10014 if (post != NULL) {
10015 *post = 0;
10016 }
10017
10018 return EJUSTRETURN;
10019 }
10020
10021 errno_t
ifp_if_ioctl(struct ifnet * ifp,unsigned long cmd,void * arg)10022 ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
10023 {
10024 #pragma unused(ifp, cmd, arg)
10025 return EOPNOTSUPP;
10026 }
10027
10028 static errno_t
ifp_if_set_bpf_tap(struct ifnet * ifp,bpf_tap_mode tm,bpf_packet_func f)10029 ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
10030 {
10031 #pragma unused(ifp, tm, f)
10032 /* XXX not sure what to do here */
10033 return 0;
10034 }
10035
/*
 * if_free handler installed on a detached interface: intentionally a
 * no-op, since the real driver's free routine has already been saved
 * and invoked by the detach-finalize path.
 */
static void
ifp_if_free(struct ifnet *ifp)
{
	(void)ifp;
}
10041
/*
 * Event handler installed on a detached interface: events are silently
 * ignored.
 */
static void
ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
{
	(void)ifp;
	(void)e;
}
10047
/*
 * Find or allocate a dlil_ifnet for the given interface family.
 *
 * The global dlil_ifnet_head list is scanned in full: an in-use entry of
 * the same family with a matching extended name or matching unique id
 * fails the request with EBUSY; the first not-in-use entry with a
 * matching unique id is remembered for recycling.  If no recyclable
 * entry exists, a fresh dlil_ifnet is carved out of dlif_zone, aligned
 * to 64 bits, initialized, and appended to the list.
 *
 * On success, *ifp points to the ifnet (referenced via dlil_if_ref())
 * and 0 is returned; otherwise EBUSY or ENOMEM.
 */
int
dlil_if_acquire(u_int32_t family, const void *uniqueid,
    size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
{
	struct ifnet *ifp1 = NULL;
	struct dlil_ifnet *dlifp1 = NULL;
	struct dlil_ifnet *dlifp1_saved = NULL;	/* first recyclable match */
	void *buf, *base, **pbuf;
	int ret = 0;

	VERIFY(*ifp == NULL);
	dlil_if_lock();
	/*
	 * We absolutely can't have an interface with the same name
	 * in in-use state.
	 * To make sure of that list has to be traversed completely
	 */
	TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
		ifp1 = (struct ifnet *)dlifp1;

		if (ifp1->if_family != family) {
			continue;
		}

		/*
		 * If interface is in use, return EBUSY if either unique id
		 * or interface extended names are the same
		 */
		lck_mtx_lock(&dlifp1->dl_if_lock);
		if (strncmp(ifxname, ifp1->if_xname, IFXNAMSIZ) == 0 &&
		    (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
			lck_mtx_unlock(&dlifp1->dl_if_lock);
			ret = EBUSY;
			goto end;
		}

		if (uniqueid_len != 0 &&
		    uniqueid_len == dlifp1->dl_if_uniqueid_len &&
		    bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len) == 0) {
			if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
				lck_mtx_unlock(&dlifp1->dl_if_lock);
				ret = EBUSY;
				goto end;
			}
			if (dlifp1_saved == NULL) {
				/* cache the first match */
				dlifp1_saved = dlifp1;
			}
			/*
			 * Do not break or jump to end as we have to traverse
			 * the whole list to ensure there are no name collisions
			 */
		}
		lck_mtx_unlock(&dlifp1->dl_if_lock);
	}

	/* If there's an interface that can be recycled, use that */
	if (dlifp1_saved != NULL) {
		lck_mtx_lock(&dlifp1_saved->dl_if_lock);
		if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
			/* some other thread got in ahead of us */
			lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
			ret = EBUSY;
			goto end;
		}
		dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
		lck_mtx_unlock(&dlifp1_saved->dl_if_lock);
		*ifp = (struct ifnet *)dlifp1_saved;
		dlil_if_ref(*ifp);
		goto end;
	}

	/* no interface found, allocate a new one */
	buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* Get the 64-bit aligned base address for this object */
	base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
	    sizeof(u_int64_t));
	VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));

	/*
	 * Wind back a pointer size from the aligned base and
	 * save the original address so we can free it later.
	 */
	pbuf = (void **)((intptr_t)base - sizeof(void *));
	*pbuf = buf;
	dlifp1 = base;

	if (uniqueid_len) {
		dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
		    Z_WAITOK);
		if (dlifp1->dl_if_uniqueid == NULL) {
			/* free the raw zone element, not the aligned base */
			zfree(dlif_zone, buf);
			ret = ENOMEM;
			goto end;
		}
		bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len);
		dlifp1->dl_if_uniqueid_len = uniqueid_len;
	}

	ifp1 = (struct ifnet *)dlifp1;
	dlifp1->dl_if_flags = DLIF_INUSE;
	if (ifnet_debug) {
		dlifp1->dl_if_flags |= DLIF_DEBUG;
		dlifp1->dl_if_trace = dlil_if_trace;
	}
	/* name/xname point into storage embedded in the dlil_ifnet */
	ifp1->if_name = dlifp1->dl_if_namestorage;
	ifp1->if_xname = dlifp1->dl_if_xnamestorage;

	/* initialize interface description */
	ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
	ifp1->if_desc.ifd_len = 0;
	ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;

#if SKYWALK
	LIST_INIT(&ifp1->if_netns_tokens);
#endif /* SKYWALK */

	if ((ret = dlil_alloc_local_stats(ifp1)) != 0) {
		DLIL_PRINTF("%s: failed to allocate if local stats, "
		    "error: %d\n", __func__, ret);
		/* This probably shouldn't be fatal */
		ret = 0;
	}

	lck_mtx_init(&dlifp1->dl_if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_ref_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_flt_lock, &ifnet_lock_group, &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_addrconfig_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	lck_rw_init(&ifp1->if_llreach_lock, &ifnet_lock_group, &ifnet_lock_attr);
#if INET
	lck_rw_init(&ifp1->if_inetdata_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inetdata = NULL;
#endif
	lck_mtx_init(&ifp1->if_inet6_ioctl_lock, &ifnet_lock_group, &ifnet_lock_attr);
	ifp1->if_inet6_ioctl_busy = FALSE;
	lck_rw_init(&ifp1->if_inet6data_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_inet6data = NULL;
	lck_rw_init(&ifp1->if_link_status_lock, &ifnet_lock_group,
	    &ifnet_lock_attr);
	ifp1->if_link_status = NULL;
	lck_mtx_init(&ifp1->if_delegate_lock, &ifnet_lock_group, &ifnet_lock_attr);

	/* for send data paths */
	lck_mtx_init(&ifp1->if_start_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);
	lck_mtx_init(&ifp1->if_cached_route_lock, &ifnet_snd_lock_group,
	    &ifnet_lock_attr);

	/* for receive data paths */
	lck_mtx_init(&ifp1->if_poll_lock, &ifnet_rcv_lock_group,
	    &ifnet_lock_attr);

	/* thread call allocation is done with sleeping zalloc */
	ifp1->if_dt_tcall = thread_call_allocate_with_options(dlil_dt_tcall_fn,
	    ifp1, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (ifp1->if_dt_tcall == NULL) {
		panic_plain("%s: couldn't create if_dt_tcall", __func__);
		/* NOTREACHED */
	}

	TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);

	*ifp = ifp1;
	dlil_if_ref(*ifp);

end:
	dlil_if_unlock();

	/* sanity: the object and its if_data must be 64-bit aligned */
	VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
	    IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));

	return ret;
}
10226
/*
 * Common release path for a dlil_ifnet: drop allocation statistics,
 * free the broadcast-address storage, reset the name and extended name
 * back to the embedded storage (the xname is marked with a trailing
 * '?'), and optionally clear the DLIF_INUSE flag so the entry can be
 * recycled by dlil_if_acquire().
 */
static void
_dlil_if_release(ifnet_t ifp, bool clear_in_use)
{
	struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
	if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
	}

	ifnet_lock_exclusive(ifp);
	kfree_data_counted_by(ifp->if_broadcast.ptr, ifp->if_broadcast.length);
	lck_mtx_lock(&dlifp->dl_if_lock);
	/* keep the current name, but point it back at embedded storage */
	strlcpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ);
	ifp->if_name = dlifp->dl_if_namestorage;
	/* Reset external name (name + unit) */
	ifp->if_xname = dlifp->dl_if_xnamestorage;
	snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
	    "%s?", ifp->if_name);
	if (clear_in_use) {
		ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
		dlifp->dl_if_flags &= ~DLIF_INUSE;
	}
	lck_mtx_unlock(&dlifp->dl_if_lock);
	ifnet_lock_done(ifp);
}
10253
10254 __private_extern__ void
dlil_if_release(ifnet_t ifp)10255 dlil_if_release(ifnet_t ifp)
10256 {
10257 _dlil_if_release(ifp, false);
10258 }
10259
/* Acquire the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock(void)
{
	lck_mtx_lock(&dlil_ifnet_lock);
}
10265
/* Release the global dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_unlock(void)
{
	lck_mtx_unlock(&dlil_ifnet_lock);
}
10271
/* Assert that the current thread owns the dlil_ifnet list mutex. */
__private_extern__ void
dlil_if_lock_assert(void)
{
	LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
}
10277
/*
 * Unplumb every protocol still attached to the interface as part of
 * detach; only PF_INET and PF_INET6 need an explicit unplumb here.
 */
__private_extern__ void
dlil_proto_unplumb_all(struct ifnet *ifp)
{
	/*
	 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
	 * each bucket contains exactly one entry; PF_VLAN does not need an
	 * explicit unplumb.
	 *
	 * if_proto_hash[3] is for other protocols; we expect anything
	 * in this bucket to respond to the DETACHING event (which would
	 * have happened by now) and do the unplumb then.
	 */
	(void) proto_unplumb(PF_INET, ifp);
	(void) proto_unplumb(PF_INET6, ifp);
}
10293
/*
 * Copy the interface's cached IPv4 source route into *dst under
 * if_cached_route_lock.  The lock is taken as a spin lock and converted
 * to a full mutex, since route_copyout() may take references/blocks.
 */
static void
ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout(dst, &ifp->if_src_route, sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10304
/*
 * Store *src into the interface's cached IPv4 source route, consuming
 * the caller's reference.  If caching is disabled (if_fwd_cacheok is
 * clear, e.g. during detach), the route is simply released instead.
 */
static void
ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin(src, &ifp->if_src_route, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10318
/*
 * IPv6 counterpart of ifp_src_route_copyout(): copy the cached IPv6
 * source route into *dst under if_cached_route_lock.
 */
static void
ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
	    sizeof(*dst));

	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10330
/*
 * IPv6 counterpart of ifp_src_route_copyin(): store *src into the
 * cached IPv6 source route (consuming the caller's reference), or
 * release it when caching is disabled.
 */
static void
ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
{
	lck_mtx_lock_spin(&ifp->if_cached_route_lock);
	lck_mtx_convert_spin(&ifp->if_cached_route_lock);

	if (ifp->if_fwd_cacheok) {
		route_copyin((struct route *)src,
		    (struct route *)&ifp->if_src_route6, sizeof(*src));
	} else {
		ROUTE_RELEASE(src);
	}
	lck_mtx_unlock(&ifp->if_cached_route_lock);
}
10345
/*
 * Look up (and cache) a scoped IPv4 route for src_ip on this interface.
 *
 * The cached if_src_route is reused when it is still usable and matches
 * src_ip; otherwise a fresh rtalloc1_scoped() lookup is performed and
 * the result is stored back into the cache.  Returns a referenced
 * rtentry (caller must release) or NULL if no route was found.
 */
struct rtentry *
ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
{
	struct route src_rt;
	struct sockaddr_in *dst;

	dst = SIN(&src_rt.ro_dst);

	ifp_src_route_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
		ROUTE_RELEASE(&src_rt);
		if (dst->sin_family != AF_INET) {
			/* first use: initialize the sockaddr template */
			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			dst->sin_len = sizeof(src_rt.ro_dst);
			dst->sin_family = AF_INET;
		}
		dst->sin_addr = src_ip;

		VERIFY(src_rt.ro_rt == NULL);
		src_rt.ro_rt = rtalloc1_scoped(SA(dst),
		    0, 0, ifp->if_index);

		if (src_rt.ro_rt != NULL) {
			/* retain a ref, copyin consumes one */
			struct rtentry *rte = src_rt.ro_rt;
			RT_ADDREF(rte);
			ifp_src_route_copyin(ifp, &src_rt);
			src_rt.ro_rt = rte;
		}
	}

	return src_rt.ro_rt;
}
10380
/*
 * IPv6 counterpart of ifnet_cached_rtlookup_inet(): look up (and cache)
 * a scoped route for *src_ip6 on this interface.  Returns a referenced
 * rtentry or NULL.
 */
struct rtentry *
ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
{
	struct route_in6 src_rt;

	ifp_src_route6_copyout(ifp, &src_rt);

	if (ROUTE_UNUSABLE(&src_rt) ||
	    !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
		ROUTE_RELEASE(&src_rt);
		if (src_rt.ro_dst.sin6_family != AF_INET6) {
			/* first use: initialize the sockaddr template */
			SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
			src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
			src_rt.ro_dst.sin6_family = AF_INET6;
		}
		src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));

		/*
		 * NOTE(review): the v4 path asserts ro_rt == NULL here via
		 * VERIFY; this guard should always be true after the
		 * ROUTE_RELEASE above — confirm before tightening.
		 */
		if (src_rt.ro_rt == NULL) {
			src_rt.ro_rt = rtalloc1_scoped(
				SA(&src_rt.ro_dst), 0, 0,
				ifp->if_index);

			if (src_rt.ro_rt != NULL) {
				/* retain a ref, copyin consumes one */
				struct rtentry *rte = src_rt.ro_rt;
				RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
				src_rt.ro_rt = rte;
			}
		}
	}

	return src_rt.ro_rt;
}
10417
/*
 * Update the interface's link quality metric (LQM) state and post a
 * KEV_DL_LINK_QUALITY_METRIC_CHANGED kernel event when it changes.
 *
 * The raw lqm value is first normalized to one of the
 * IFNET_LQM_THRESH_* edge values.  'locked' indicates whether the
 * caller already holds the ifnet lock exclusively; the lock is always
 * dropped before posting the kernel event and reacquired before
 * returning only when the caller held it.
 */
void
if_lqm_update(struct ifnet *ifp, int lqm, int locked)
{
	struct kev_dl_link_quality_metric_data ev_lqm_data;

	VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);

	/* Normalize to edge */
	if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
		lqm = IFNET_LQM_THRESH_ABORT;
		/* flag tcbinfo and kick the fast pcb timer so TCP reacts */
		os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
	} else if (lqm > IFNET_LQM_THRESH_ABORT &&
	    lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
		lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
	} else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
	    lqm <= IFNET_LQM_THRESH_POOR) {
		lqm = IFNET_LQM_THRESH_POOR;
	} else if (lqm > IFNET_LQM_THRESH_POOR &&
	    lqm <= IFNET_LQM_THRESH_GOOD) {
		lqm = IFNET_LQM_THRESH_GOOD;
	}

	/*
	 * Take the lock if needed
	 */
	if (!locked) {
		ifnet_lock_exclusive(ifp);
	}

	if (lqm == ifp->if_interface_state.lqm_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID)) {
		/*
		 * Release the lock if was not held by the caller
		 */
		if (!locked) {
			ifnet_lock_done(ifp);
		}
		return; /* nothing to update */
	}
	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_LQM_STATE_VALID;
	ifp->if_interface_state.lqm_state = (int8_t)lqm;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
	ev_lqm_data.link_quality_metric = lqm;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);

	/*
	 * Reacquire the lock for the caller
	 */
	if (locked) {
		ifnet_lock_exclusive(ifp);
	}
}
10481
/*
 * Update the interface's cellular RRC (radio resource control) state
 * and post KEV_DL_RRC_STATE_CHANGED when it changes.
 *
 * Expects the ifnet lock to be held exclusively by the caller (see
 * if_state_update()); the lock is dropped around the kernel-event post
 * and reacquired before returning.
 */
static void
if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
{
	struct kev_dl_rrc_state kev;

	/* Nothing to do if the state is already valid and unchanged */
	if (rrc_state == ifp->if_interface_state.rrc_state &&
	    (ifp->if_interface_state.valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		return;
	}

	ifp->if_interface_state.valid_bitmask |=
	    IF_INTERFACE_STATE_RRC_STATE_VALID;

	ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;

	/*
	 * Don't want to hold the lock when issuing kernel events
	 */
	ifnet_lock_done(ifp);

	bzero(&kev, sizeof(struct kev_dl_rrc_state));
	kev.rrc_state = rrc_state;

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);

	ifnet_lock_exclusive(ifp);
}
10511
/*
 * Apply an externally supplied interface state update.
 *
 * Validation (under the exclusive ifnet lock):
 *  - RRC state updates are only accepted on IFT_CELLULAR interfaces
 *    (ENOTSUP otherwise);
 *  - a valid LQM value must lie in [IFNET_LQM_MIN, IFNET_LQM_MAX];
 *  - a valid RRC state must be IDLE or CONNECTED (EINVAL otherwise).
 *
 * Each field flagged in valid_bitmask is then applied.  If the update
 * marks the interface available, TCP connections on it are probed after
 * the lock is dropped so they recover without waiting for their timers.
 *
 * Returns 0 on success, ENOTSUP or EINVAL on validation failure.
 */
errno_t
if_state_update(struct ifnet *ifp,
    struct if_interface_state *if_interface_state)
{
	u_short if_index_available = 0;

	ifnet_lock_exclusive(ifp);

	if ((ifp->if_type != IFT_CELLULAR) &&
	    (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID)) {
		ifnet_lock_done(ifp);
		return ENOTSUP;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) &&
	    (if_interface_state->lqm_state < IFNET_LQM_MIN ||
	    if_interface_state->lqm_state > IFNET_LQM_MAX)) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}
	if ((if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_IDLE &&
	    if_interface_state->rrc_state !=
	    IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
		ifnet_lock_done(ifp);
		return EINVAL;
	}

	/* if_lqm_update/if_rrc_state_update are called with the lock held */
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
	}
	if (if_interface_state->valid_bitmask &
	    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
		ifp->if_interface_state.valid_bitmask |=
		    IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
		ifp->if_interface_state.interface_availability =
		    if_interface_state->interface_availability;

		if (ifp->if_interface_state.interface_availability ==
		    IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
			    __func__, if_name(ifp), ifp->if_index);
			if_index_available = ifp->if_index;
		} else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable)\n",
			    __func__, if_name(ifp), ifp->if_index);
		}
	}
	ifnet_lock_done(ifp);

	/*
	 * Check if the TCP connections going on this interface should be
	 * forced to send probe packets instead of waiting for TCP timers
	 * to fire. This is done on an explicit notification such as
	 * SIOCSIFINTERFACESTATE which marks the interface as available.
	 */
	if (if_index_available > 0) {
		tcp_interface_send_probe(if_index_available);
	}

	return 0;
}
10582
10583 void
if_get_state(struct ifnet * ifp,struct if_interface_state * if_interface_state)10584 if_get_state(struct ifnet *ifp,
10585 struct if_interface_state *if_interface_state)
10586 {
10587 ifnet_lock_shared(ifp);
10588
10589 if_interface_state->valid_bitmask = 0;
10590
10591 if (ifp->if_interface_state.valid_bitmask &
10592 IF_INTERFACE_STATE_RRC_STATE_VALID) {
10593 if_interface_state->valid_bitmask |=
10594 IF_INTERFACE_STATE_RRC_STATE_VALID;
10595 if_interface_state->rrc_state =
10596 ifp->if_interface_state.rrc_state;
10597 }
10598 if (ifp->if_interface_state.valid_bitmask &
10599 IF_INTERFACE_STATE_LQM_STATE_VALID) {
10600 if_interface_state->valid_bitmask |=
10601 IF_INTERFACE_STATE_LQM_STATE_VALID;
10602 if_interface_state->lqm_state =
10603 ifp->if_interface_state.lqm_state;
10604 }
10605 if (ifp->if_interface_state.valid_bitmask &
10606 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10607 if_interface_state->valid_bitmask |=
10608 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10609 if_interface_state->interface_availability =
10610 ifp->if_interface_state.interface_availability;
10611 }
10612
10613 ifnet_lock_done(ifp);
10614 }
10615
10616 errno_t
if_probe_connectivity(struct ifnet * ifp,u_int32_t conn_probe)10617 if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10618 {
10619 if (conn_probe > 1) {
10620 return EINVAL;
10621 }
10622 if (conn_probe == 0) {
10623 if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10624 } else {
10625 if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10626 }
10627
10628 #if NECP
10629 necp_update_all_clients();
10630 #endif /* NECP */
10631
10632 tcp_probe_connectivity(ifp, conn_probe);
10633 return 0;
10634 }
10635
/* for uuid.c */
/*
 * Find the index of the primary Ethernet interface: en0 if present,
 * otherwise the lowest-unit "en" interface, otherwise any IFT_ETHER
 * interface.
 *
 * Returns en0's ifindex, or 0 if en0 does not exist; in the latter case
 * *ret_other_index receives the best fallback ifindex (or 0).  The
 * only caller, uuid_get_ethernet(), walks ifnet_head while holding the
 * head lock shared — callers must provide the same stability.
 */
static int
get_ether_index(int * ret_other_index)
{
	struct ifnet *ifp;
	int en0_index = 0;
	int other_en_index = 0;
	int any_ether_index = 0;
	short best_unit = 0;

	*ret_other_index = 0;
	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
		/*
		 * find en0, or if not en0, the lowest unit en*, and if not
		 * that, any ethernet
		 */
		ifnet_lock_shared(ifp);
		if (strcmp(ifp->if_name, "en") == 0) {
			if (ifp->if_unit == 0) {
				/* found en0, we're done */
				en0_index = ifp->if_index;
				ifnet_lock_done(ifp);
				break;
			}
			if (other_en_index == 0 || ifp->if_unit < best_unit) {
				other_en_index = ifp->if_index;
				best_unit = ifp->if_unit;
			}
		} else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
			any_ether_index = ifp->if_index;
		}
		ifnet_lock_done(ifp);
	}
	if (en0_index == 0) {
		/* no en0: fall back to best en*, then any Ethernet */
		if (other_en_index != 0) {
			*ret_other_index = other_en_index;
		} else if (any_ether_index != 0) {
			*ret_other_index = any_ether_index;
		}
	}
	return en0_index;
}
10678
/*
 * Fetch a 6-byte Ethernet address to use as the UUID node identifier.
 *
 * Uses (and caches in a static) the index of the primary Ethernet
 * interface found by get_ether_index(); the cache is refreshed if the
 * indexed ifnet has gone away.  Prefers the interface's permanent
 * Ethernet address when one was recorded, since it never changes.
 *
 * Returns 0 and fills node[ETHER_ADDR_LEN] on success, -1 if no
 * Ethernet interface exists.
 */
int
uuid_get_ethernet(u_int8_t *node)
{
	static int en0_index;	/* cached ifindex of en0 across calls */
	struct ifnet *ifp;
	int other_index = 0;
	int the_index = 0;
	int ret;

	ifnet_head_lock_shared();
	if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
	}
	if (en0_index != 0) {
		the_index = en0_index;
	} else if (other_index != 0) {
		the_index = other_index;
	}
	if (the_index != 0) {
		struct dlil_ifnet *dl_if;

		ifp = ifindex2ifnet[the_index];
		VERIFY(ifp != NULL);
		dl_if = (struct dlil_ifnet *)ifp;
		if (dl_if->dl_if_permanent_ether_is_set != 0) {
			/*
			 * Use the permanent ethernet address if it is
			 * available because it will never change.
			 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
		} else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
		}
		ret = 0;
	} else {
		ret = -1;
	}
	ifnet_head_done();
	return ret;
}
10720
10721 int
dlil_node_present(struct ifnet * ifp,struct sockaddr * sa,int32_t rssi,int lqm,int npm,u_int8_t srvinfo[48])10722 dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
10723 int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
10724 {
10725 struct kev_dl_node_presence kev;
10726 struct sockaddr_dl *sdl;
10727 struct sockaddr_in6 *sin6;
10728 int ret = 0;
10729
10730 VERIFY(ifp);
10731 VERIFY(sa);
10732 VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
10733
10734 bzero(&kev, sizeof(kev));
10735 sin6 = &kev.sin6_node_address;
10736 sdl = &kev.sdl_node_address;
10737 nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
10738 kev.rssi = rssi;
10739 kev.link_quality_metric = lqm;
10740 kev.node_proximity_metric = npm;
10741 bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
10742
10743 ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
10744 if (ret == 0 || ret == EEXIST) {
10745 int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
10746 &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
10747 if (err != 0) {
10748 log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with"
10749 "error %d\n", __func__, err);
10750 }
10751 }
10752
10753 if (ret == EEXIST) {
10754 ret = 0;
10755 }
10756 return ret;
10757 }
10758
/*
 * Report that a node has disappeared from the interface: update
 * neighbor state via nd6_alt_node_absent() and, on success, post a
 * KEV_DL_NODE_ABSENCE kernel event.
 *
 * 'sa' may be either the node's IPv6 address (AF_INET6) or its
 * link-layer address (AF_LINK); the missing half is derived.
 */
void
dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
{
	struct kev_dl_node_absence kev = {};
	struct sockaddr_in6 *kev_sin6 = NULL;
	struct sockaddr_dl *kev_sdl = NULL;
	int error = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL);
	VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	if (sa->sa_family == AF_INET6) {
		/*
		 * If IPv6 address is given, get the link layer
		 * address from what was cached in the neighbor cache
		 */
		VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
		/* nd6_alt_node_absent fills kev_sdl from the cache */
		error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
	} else {
		/*
		 * If passed address is AF_LINK type, derive the address
		 * based on the link address.
		 */
		nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
		error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
	}

	if (error == 0) {
		kev_sdl->sdl_type = ifp->if_type;
		kev_sdl->sdl_index = ifp->if_index;

		dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
	}
}
10799
/*
 * Variant of dlil_node_present() that takes the IPv6 address and the
 * link-layer address as separate, already-decomposed sockaddrs instead
 * of deriving one from the other.  Updates neighbor state via
 * nd6_alt_node_present() and posts KEV_DL_NODE_PRESENCE.
 *
 * Returns 0 on success (EEXIST is treated as success); other errors
 * are passed through.
 */
int
dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
    int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
{
	struct kev_dl_node_presence kev = {};
	struct sockaddr_dl *kev_sdl = NULL;
	struct sockaddr_in6 *kev_sin6 = NULL;
	int ret = 0;

	VERIFY(ifp != NULL);
	VERIFY(sa != NULL && sdl != NULL);
	VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);

	kev_sin6 = &kev.sin6_node_address;
	kev_sdl = &kev.sdl_node_address;

	VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
	kev_sdl->sdl_type = ifp->if_type;
	kev_sdl->sdl_index = ifp->if_index;

	VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);

	kev.rssi = rssi;
	kev.link_quality_metric = lqm;
	kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));

	ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
	if (ret == 0 || ret == EEXIST) {
		/* EEXIST means "already known": post the event as an update */
		int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
		}
	}

	if (ret == EEXIST) {
		ret = 0;
	}
	return ret;
}
10843
10844 const void *
dlil_ifaddr_bytes(const struct sockaddr_dl * sdl,size_t * sizep,kauth_cred_t * credp)10845 dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
10846 kauth_cred_t *credp)
10847 {
10848 const u_int8_t *bytes;
10849 size_t size;
10850
10851 bytes = CONST_LLADDR(sdl);
10852 size = sdl->sdl_alen;
10853
10854 #if CONFIG_MACF
10855 if (dlil_lladdr_ckreq) {
10856 switch (sdl->sdl_type) {
10857 case IFT_ETHER:
10858 case IFT_IEEE1394:
10859 break;
10860 default:
10861 credp = NULL;
10862 break;
10863 }
10864 ;
10865
10866 if (credp && mac_system_check_info(*credp, "net.link.addr")) {
10867 static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
10868 [0] = 2
10869 };
10870
10871 bytes = unspec;
10872 }
10873 }
10874 #else
10875 #pragma unused(credp)
10876 #endif
10877
10878 if (sizep != NULL) {
10879 *sizep = size;
10880 }
10881 return bytes;
10882 }
10883
10884 void
dlil_report_issues(struct ifnet * ifp,u_int8_t modid[DLIL_MODIDLEN],u_int8_t info[DLIL_MODARGLEN])10885 dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
10886 u_int8_t info[DLIL_MODARGLEN])
10887 {
10888 struct kev_dl_issues kev;
10889 struct timeval tv;
10890
10891 VERIFY(ifp != NULL);
10892 VERIFY(modid != NULL);
10893 _CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
10894 _CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
10895
10896 bzero(&kev, sizeof(kev));
10897
10898 microtime(&tv);
10899 kev.timestamp = tv.tv_sec;
10900 bcopy(modid, &kev.modid, DLIL_MODIDLEN);
10901 if (info != NULL) {
10902 bcopy(info, &kev.info, DLIL_MODARGLEN);
10903 }
10904
10905 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
10906 &kev.link_data, sizeof(kev), FALSE);
10907 }
10908
/*
 * SIOCSIFOPPORTUNISTIC / SIOCGIFOPPORTUNISTIC handler: set or get the
 * interface's opportunistic throttling state, and report back (in
 * ifo_inuse) the number of opportunistic TCP/UDP connections currently
 * using the interface.
 */
errno_t
ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
    struct proc *p)
{
	u_int32_t level = IFNET_THROTTLE_OFF;
	errno_t result = 0;

	VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);

	if (cmd == SIOCSIFOPPORTUNISTIC) {
		/*
		 * XXX: Use priv_check_cred() instead of root check?
		 */
		if ((result = proc_suser(p)) != 0) {
			return result;
		}

		/* only the all-or-nothing flag values are accepted */
		if (ifr->ifr_opportunistic.ifo_flags ==
		    IFRIFOF_BLOCK_OPPORTUNISTIC) {
			level = IFNET_THROTTLE_OPPORTUNISTIC;
		} else if (ifr->ifr_opportunistic.ifo_flags == 0) {
			level = IFNET_THROTTLE_OFF;
		} else {
			result = EINVAL;
		}

		if (result == 0) {
			result = ifnet_set_throttle(ifp, level);
		}
	} else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
		/* get: translate the current level back into flags */
		ifr->ifr_opportunistic.ifo_flags = 0;
		if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
			ifr->ifr_opportunistic.ifo_flags |=
			    IFRIFOF_BLOCK_OPPORTUNISTIC;
		}
	}

	/*
	 * Return the count of current opportunistic connections
	 * over the interface.
	 */
	if (result == 0) {
		uint32_t flags = 0;
		flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_SETCMD : 0;
		flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
		    INPCB_OPPORTUNISTIC_THROTTLEON : 0;
		ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
	}

	/*
	 * NOTE(review): EALREADY is mapped to success here — presumably it
	 * means the requested throttle level was already in effect; confirm
	 * against the classq request path.
	 */
	if (result == EALREADY) {
		result = 0;
	}

	return result;
}
10967
/*
 * Return the interface's current transmit throttling level via
 * `level'.  Only meaningful for interfaces using the new TX model
 * (IFEF_TXSTART); returns ENXIO otherwise.  Defaults to
 * IFNET_THROTTLE_OFF when the classq is not enabled.
 */
int
ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	/* default when the classq cannot be queried */
	*level = IFNET_THROTTLE_OFF;

	ifq = ifp->if_snd;
	IFCQ_LOCK(ifq);
	/* Throttling works only for IFCQ, not ALTQ instances */
	if (IFCQ_IS_ENABLED(ifq)) {
		/* first field 0 = query (contrast the set path, which uses 1) */
		cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
		*level = req.level;
	}
	IFCQ_UNLOCK(ifq);

	return err;
}
10993
/*
 * Set the interface's transmit throttling level.  Only
 * IFNET_THROTTLE_OFF and IFNET_THROTTLE_OPPORTUNISTIC are accepted;
 * interfaces without IFEF_TXSTART return ENXIO.  On success the new
 * level is logged, NECP clients are notified, and — when throttling is
 * turned off — the transmit path is kicked to drain held-back packets.
 */
int
ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
{
	struct ifclassq *ifq;
	int err = 0;

	if (!(ifp->if_eflags & IFEF_TXSTART)) {
		return ENXIO;
	}

	ifq = ifp->if_snd;

	switch (level) {
	case IFNET_THROTTLE_OFF:
	case IFNET_THROTTLE_OPPORTUNISTIC:
		break;
	default:
		return EINVAL;
	}

	IFCQ_LOCK(ifq);
	if (IFCQ_IS_ENABLED(ifq)) {
		/* first field 1 = set (contrast the get path, which uses 0) */
		cqrq_throttle_t req = { 1, level };

		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
	}
	IFCQ_UNLOCK(ifq);

	if (err == 0) {
		DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
		    level);
#if NECP
		/* throttling state is relevant to NECP client policies */
		necp_update_all_clients();
#endif /* NECP */
		if (level == IFNET_THROTTLE_OFF) {
			/* resume transmission of any packets held back */
			ifnet_start(ifp);
		}
	}

	return err;
}
11035
11036 errno_t
ifnet_getset_log(ifnet_t ifp,u_long cmd,struct ifreq * ifr,struct proc * p)11037 ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11038 struct proc *p)
11039 {
11040 #pragma unused(p)
11041 errno_t result = 0;
11042 uint32_t flags;
11043 int level, category, subcategory;
11044
11045 VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
11046
11047 if (cmd == SIOCSIFLOG) {
11048 if ((result = priv_check_cred(kauth_cred_get(),
11049 PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
11050 return result;
11051 }
11052
11053 level = ifr->ifr_log.ifl_level;
11054 if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
11055 result = EINVAL;
11056 }
11057
11058 flags = ifr->ifr_log.ifl_flags;
11059 if ((flags &= IFNET_LOGF_MASK) == 0) {
11060 result = EINVAL;
11061 }
11062
11063 category = ifr->ifr_log.ifl_category;
11064 subcategory = ifr->ifr_log.ifl_subcategory;
11065
11066 if (result == 0) {
11067 result = ifnet_set_log(ifp, level, flags,
11068 category, subcategory);
11069 }
11070 } else {
11071 result = ifnet_get_log(ifp, &level, &flags, &category,
11072 &subcategory);
11073 if (result == 0) {
11074 ifr->ifr_log.ifl_level = level;
11075 ifr->ifr_log.ifl_flags = flags;
11076 ifr->ifr_log.ifl_category = category;
11077 ifr->ifr_log.ifl_subcategory = subcategory;
11078 }
11079 }
11080
11081 return result;
11082 }
11083
/*
 * Apply logging parameters to the interface.  The level applies to all
 * logging facilities; facility selection is carried in `flags'.  If
 * the driver registered an output-control callback, the request
 * (minus the DLIL facility bit, which is handled here) is forwarded to
 * the lower layers; otherwise facility bits other than DLIL are
 * silently dropped.
 */
int
ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
    int32_t category, int32_t subcategory)
{
	int err = 0;

	VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
	VERIFY(flags & IFNET_LOGF_MASK);

	/*
	 * The logging level applies to all facilities; make sure to
	 * update them all with the most current level.
	 */
	flags |= ifp->if_log.flags;

	if (ifp->if_output_ctl != NULL) {
		struct ifnet_log_params l;

		bzero(&l, sizeof(l));
		l.level = level;
		l.flags = flags;
		l.flags &= ~IFNET_LOGF_DLIL;	/* DLIL facility handled here, not below */
		l.category = category;
		l.subcategory = subcategory;

		/* Send this request to lower layers */
		if (l.flags != 0) {
			err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
			    sizeof(l), &l);
		}
	} else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
		/*
		 * If targeted to the lower layers without an output
		 * control callback registered on the interface, just
		 * silently ignore facilities other than ours.
		 */
		flags &= IFNET_LOGF_DLIL;
		if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
			/* nothing left to log for; reset the level too */
			level = 0;
		}
	}

	if (err == 0) {
		if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
			/* default level clears all facility flags */
			ifp->if_log.flags = 0;
		} else {
			ifp->if_log.flags |= flags;
		}

		log(LOG_INFO, "%s: logging level set to %d flags=0x%x "
		    "arg=0x%x, category=%d subcategory=%d\n", if_name(ifp),
		    ifp->if_log.level, ifp->if_log.flags, flags,
		    category, subcategory);
	}

	return err;
}
11141
/*
 * Copy out the interface's current logging parameters.  Each out
 * pointer may be NULL if the caller is not interested in that field.
 * Always returns 0.
 */
int
ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
    int32_t *category, int32_t *subcategory)
{
	if (level != NULL) {
		*level = ifp->if_log.level;
	}
	if (flags != NULL) {
		*flags = ifp->if_log.flags;
	}
	if (category != NULL) {
		*category = ifp->if_log.category;
	}
	if (subcategory != NULL) {
		*subcategory = ifp->if_log.subcategory;
	}

	return 0;
}
11161
11162 int
ifnet_notify_address(struct ifnet * ifp,int af)11163 ifnet_notify_address(struct ifnet *ifp, int af)
11164 {
11165 struct ifnet_notify_address_params na;
11166
11167 #if PF
11168 (void) pf_ifaddr_hook(ifp);
11169 #endif /* PF */
11170
11171 if (ifp->if_output_ctl == NULL) {
11172 return EOPNOTSUPP;
11173 }
11174
11175 bzero(&na, sizeof(na));
11176 na.address_family = (sa_family_t)af;
11177
11178 return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11179 sizeof(na), &na);
11180 }
11181
11182 errno_t
ifnet_flowid(struct ifnet * ifp,uint32_t * flowid)11183 ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11184 {
11185 if (ifp == NULL || flowid == NULL) {
11186 return EINVAL;
11187 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11188 !IF_FULLY_ATTACHED(ifp)) {
11189 return ENXIO;
11190 }
11191
11192 *flowid = ifp->if_flowhash;
11193
11194 return 0;
11195 }
11196
/*
 * Flow-control the interface's output: mark it IFSF_FLOW_CONTROLLED so
 * the starter thread holds packets back, and register it in the flow
 * control tree (ifnet_fc_add) so a later flow advisory can resume it.
 * If a resume advisory is already pending, the pending and controlled
 * states are cleared instead — the advisory has caught up with us.
 */
errno_t
ifnet_disable_output(struct ifnet *ifp)
{
	int err = 0;

	if (ifp == NULL) {
		return EINVAL;
	} else if (!(ifp->if_eflags & IFEF_TXSTART) ||
	    !IF_FULLY_ATTACHED(ifp)) {
		return ENXIO;
	}

	lck_mtx_lock(&ifp->if_start_lock);
	if (ifp->if_start_flags & IFSF_FLOW_RESUME_PENDING) {
		/* a resume advisory raced with us; cancel both states */
		ifp->if_start_flags &= ~(IFSF_FLOW_RESUME_PENDING | IFSF_FLOW_CONTROLLED);
	} else if ((err = ifnet_fc_add(ifp)) == 0) {
		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
	}
	lck_mtx_unlock(&ifp->if_start_lock);

	return err;
}
11219
11220 errno_t
ifnet_enable_output(struct ifnet * ifp)11221 ifnet_enable_output(struct ifnet *ifp)
11222 {
11223 if (ifp == NULL) {
11224 return EINVAL;
11225 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11226 !IF_FULLY_ATTACHED(ifp)) {
11227 return ENXIO;
11228 }
11229
11230 ifnet_start_common(ifp, TRUE, FALSE);
11231 return 0;
11232 }
11233
/*
 * Handle a flow advisory for `flowhash': if an interface is registered
 * in the flow control tree under that hash, and it is still attached
 * with a matching hash (the hash is recalculated on every attach),
 * re-enable its output.  If output was not flow-controlled at this
 * point, record the advisory as a pending resume so that a concurrent
 * ifnet_disable_output() can observe it.
 */
void
ifnet_flowadv(uint32_t flowhash)
{
	struct ifnet_fc_entry *ifce;
	struct ifnet *ifp;

	ifce = ifnet_fc_get(flowhash);
	if (ifce == NULL) {
		return;
	}

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
		if (ifp->if_flowhash == flowhash) {
			lck_mtx_lock_spin(&ifp->if_start_lock);
			if ((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) == 0) {
				/* not controlled yet; note the advisory */
				ifp->if_start_flags |= IFSF_FLOW_RESUME_PENDING;
			}
			lck_mtx_unlock(&ifp->if_start_lock);
			(void) ifnet_enable_output(ifp);
		}
		/* drop the I/O reference taken by ifnet_is_attached() */
		ifnet_decr_iorefcnt(ifp);
	}
	/* entry was removed from the tree by ifnet_fc_get(); free it */
	ifnet_fc_entry_free(ifce);
}
11262
11263 /*
11264 * Function to compare ifnet_fc_entries in ifnet flow control tree
11265 */
11266 static inline int
ifce_cmp(const struct ifnet_fc_entry * fc1,const struct ifnet_fc_entry * fc2)11267 ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11268 {
11269 return fc1->ifce_flowhash - fc2->ifce_flowhash;
11270 }
11271
/*
 * Register `ifp' in the ifnet flow control tree, keyed by its flow
 * hash, so that a subsequent flow advisory can find and resume it.
 * Returns 0 when the entry is added (or already present for this
 * ifp), or EAGAIN on a flow hash collision with a different interface.
 */
static int
ifnet_fc_add(struct ifnet *ifp)
{
	struct ifnet_fc_entry keyfc, *ifce;
	uint32_t flowhash;

	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
	VERIFY(ifp->if_flowhash != 0);
	flowhash = ifp->if_flowhash;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce != NULL && ifce->ifce_ifp == ifp) {
		/* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return 0;
	}

	if (ifce != NULL) {
		/*
		 * There is a different fc entry with the same flow hash
		 * but different ifp pointer. There can be a collision
		 * on flow hash but the probability is low. Let's just
		 * avoid adding a second one when there is a collision.
		 */
		lck_mtx_unlock(&ifnet_fc_lock);
		return EAGAIN;
	}

	/* become regular mutex; the Z_WAITOK allocation below may block */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
	ifce->ifce_flowhash = flowhash;
	ifce->ifce_ifp = ifp;

	RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
	return 0;
}
11315
/*
 * Remove and return the flow control entry registered under
 * `flowhash', or NULL if none exists, or if the interface it refers to
 * is no longer attached (in which case the entry is freed here).
 * The caller owns the returned entry and must release it with
 * ifnet_fc_entry_free().
 */
static struct ifnet_fc_entry *
ifnet_fc_get(uint32_t flowhash)
{
	struct ifnet_fc_entry keyfc, *ifce;
	struct ifnet *ifp;

	bzero(&keyfc, sizeof(keyfc));
	keyfc.ifce_flowhash = flowhash;

	lck_mtx_lock_spin(&ifnet_fc_lock);
	ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
	if (ifce == NULL) {
		/* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
		return NULL;
	}

	/* found: take it out of the tree; ownership moves to the caller */
	RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);

	VERIFY(ifce->ifce_ifp != NULL);
	ifp = ifce->ifce_ifp;

	/* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);

	if (!ifnet_is_attached(ifp, 0)) {
		/*
		 * This ifp is not attached or in the process of being
		 * detached; just don't process it.
		 */
		ifnet_fc_entry_free(ifce);
		ifce = NULL;
	}
	lck_mtx_unlock(&ifnet_fc_lock);

	return ifce;
}
11353
/*
 * Return an ifnet_fc_entry to its zone allocator.
 */
static void
ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
{
	zfree(ifnet_fc_zone, ifce);
}
11359
11360 static uint32_t
ifnet_calc_flowhash(struct ifnet * ifp)11361 ifnet_calc_flowhash(struct ifnet *ifp)
11362 {
11363 struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11364 uint32_t flowhash = 0;
11365
11366 if (ifnet_flowhash_seed == 0) {
11367 ifnet_flowhash_seed = RandomULong();
11368 }
11369
11370 bzero(&fh, sizeof(fh));
11371
11372 (void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11373 fh.ifk_unit = ifp->if_unit;
11374 fh.ifk_flags = ifp->if_flags;
11375 fh.ifk_eflags = ifp->if_eflags;
11376 fh.ifk_capabilities = ifp->if_capabilities;
11377 fh.ifk_capenable = ifp->if_capenable;
11378 fh.ifk_output_sched_model = ifp->if_output_sched_model;
11379 fh.ifk_rand1 = RandomULong();
11380 fh.ifk_rand2 = RandomULong();
11381
11382 try_again:
11383 flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11384 if (flowhash == 0) {
11385 /* try to get a non-zero flowhash */
11386 ifnet_flowhash_seed = RandomULong();
11387 goto try_again;
11388 }
11389
11390 return flowhash;
11391 }
11392
/*
 * Install (or clear, when len == 0) the network signature for address
 * family `family' on the interface.  The signature is stored in the
 * per-family interface extension data (IN_IFEXTRA / IN6_IFEXTRA);
 * `flags' is currently unused.  Returns EINVAL for an unsupported
 * family or an oversized signature, ENOMEM when the extension data
 * has not been allocated.
 */
int
ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
    uint16_t flags, uint8_t *data)
{
#pragma unused(flags)
	int error = 0;

	switch (family) {
	case AF_INET:
		if_inetdata_lock_exclusive(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
				/* early break: unlock here, skip common exit */
				if_inetdata_lock_done(ifp);
				break;
			} else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inetdata_lock_done(ifp);
				break;
			}
			IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_exclusive(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (len == 0) {
				/* Allow clearing the signature */
				IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
				/* early break: unlock here, skip common exit */
				if_inet6data_lock_done(ifp);
				break;
			} else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
				error = EINVAL;
				if_inet6data_lock_done(ifp);
				break;
			}
			IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	return error;
}
11454
/*
 * Copy out the network signature for address family `family'.  On
 * input *len is the caller's buffer size; on success it is updated to
 * the stored signature length.  Returns EINVAL for bad arguments or a
 * buffer too small, ENOENT when no signature is set, ENOMEM when the
 * per-family extension data is missing.  *flags, if provided, is set
 * to 0 on success.
 */
int
ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
    uint16_t *flags, uint8_t *data)
{
	int error = 0;

	if (ifp == NULL || len == NULL || data == NULL) {
		return EINVAL;
	}

	switch (family) {
	case AF_INET:
		if_inetdata_lock_shared(ifp);
		if (IN_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				/* early break: unlock here, skip common exit */
				if_inetdata_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inetdata_lock_done(ifp);
		break;

	case AF_INET6:
		if_inet6data_lock_shared(ifp);
		if (IN6_IFEXTRA(ifp) != NULL) {
			if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
				error = EINVAL;
				/* early break: unlock here, skip common exit */
				if_inet6data_lock_done(ifp);
				break;
			}
			if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
			} else {
				error = ENOENT;
			}
		} else {
			error = ENOMEM;
		}
		if_inet6data_lock_done(ifp);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (error == 0 && flags != NULL) {
		*flags = 0;
	}

	return error;
}
11515
/*
 * Install up to NAT64_MAX_NUM_PREFIXES NAT64 prefixes on the
 * interface.  A slot with prefix_len == 0 clears the stored prefix in
 * that slot.  Only the well-known NAT64 prefix lengths (32/40/48/56/
 * 64/96 bits) are accepted, and prefixes with embedded interface or
 * link-local scope are rejected.  NECP clients are notified when at
 * least one prefix was set.
 */
int
ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
{
	int i, error = 0, one_set = 0;

	if_inet6data_lock_exclusive(ifp);

	if (IN6_IFEXTRA(ifp) == NULL) {
		error = ENOMEM;
		goto out;
	}

	for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
		uint32_t prefix_len =
		    prefixes[i].prefix_len;
		struct in6_addr *prefix =
		    &prefixes[i].ipv6_prefix;

		if (prefix_len == 0) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixes purged from Interface %s\n",
			    if_name(ifp)));
			/* Allow clearing the signature */
			IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));

			continue;
		} else if (prefix_len != NAT64_PREFIX_LEN_32 &&
		    prefix_len != NAT64_PREFIX_LEN_40 &&
		    prefix_len != NAT64_PREFIX_LEN_48 &&
		    prefix_len != NAT64_PREFIX_LEN_56 &&
		    prefix_len != NAT64_PREFIX_LEN_64 &&
		    prefix_len != NAT64_PREFIX_LEN_96) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefixlen is incorrect %d\n", prefix_len));
			error = EINVAL;
			goto out;
		}

		if (IN6_IS_SCOPE_EMBED(prefix)) {
			clat_log0((LOG_DEBUG,
			    "NAT64 prefix has interface/link local scope.\n"));
			error = EINVAL;
			goto out;
		}

		IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
		clat_log0((LOG_DEBUG,
		    "NAT64 prefix set to %s with prefixlen: %d\n",
		    ip6_sprintf(prefix), prefix_len));
		one_set = 1;
	}

out:
	if_inet6data_lock_done(ifp);

	if (error == 0 && one_set != 0) {
		/* prefix changes are relevant to NECP client policies */
		necp_update_all_clients();
	}

	return error;
}
11581
11582 int
ifnet_get_nat64prefix(struct ifnet * ifp,struct ipv6_prefix * prefixes)11583 ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
11584 {
11585 int i, found_one = 0, error = 0;
11586
11587 if (ifp == NULL) {
11588 return EINVAL;
11589 }
11590
11591 if_inet6data_lock_shared(ifp);
11592
11593 if (IN6_IFEXTRA(ifp) == NULL) {
11594 error = ENOMEM;
11595 goto out;
11596 }
11597
11598 for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
11599 if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
11600 found_one = 1;
11601 }
11602 }
11603
11604 if (found_one == 0) {
11605 error = ENOENT;
11606 goto out;
11607 }
11608
11609 if (prefixes) {
11610 bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
11611 sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
11612 }
11613
11614 out:
11615 if_inet6data_lock_done(ifp);
11616
11617 return error;
11618 }
11619
/*
 * Debug hook for transmit checksum offload: when
 * HWCKSUM_DBG_FINALIZE_FORCED is set, force software finalization of
 * IPv4/IPv6 checksums on the outgoing packet (skipped for TSO
 * packets), counting finalized headers and payloads in the
 * hwcksum_dbg_* statistics.
 */
__attribute__((noinline))
static void
dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
    protocol_family_t pf)
{
#pragma unused(ifp)
	uint32_t did_sw;

	if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
	    (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
		return;
	}

	switch (pf) {
	case PF_INET:
		did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IP) {
			hwcksum_dbg_finalized_hdr++;
		}
		if (did_sw & CSUM_DELAY_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	case PF_INET6:
		/*
		 * Checksum offload should not have been enabled when
		 * extension headers exist; that also means that we
		 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee should it skip such case by
		 * setting optlen to -1.
		 */
		did_sw = in6_finalize_cksum(m, hoff, -1, -1,
		    m->m_pkthdr.csum_flags);
		if (did_sw & CSUM_DELAY_IPV6_DATA) {
			hwcksum_dbg_finalized_data++;
		}
		break;
	default:
		return;
	}
}
11661
/*
 * Debug hook for receive checksum offload (IPv4/IPv6 only).  Two
 * features, selected by hwcksum_dbg_mode:
 *
 *  - HWCKSUM_DBG_PARTIAL_FORCED: discard any hardware-provided RX
 *    checksum state and compute a partial 16-bit 1's complement sum in
 *    software from a configured offset, simulating hardware that lacks
 *    partial-checksum support.
 *
 *  - Verification of driver-supplied partial checksums: recompute the
 *    sum and compare it against csum_rx_val, counting mismatches; with
 *    HWCKSUM_DBG_PARTIAL_RXOFF_ADJ the sum is additionally re-adjusted
 *    to a different start offset to emulate various hardware.
 */
static void
dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
    protocol_family_t pf)
{
	uint16_t sum = 0;
	uint32_t hlen;

	/* sanity-check the frame header pointer against the mbuf extent */
	if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
	    frame_header > (char *)m->m_data) {
		DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
		    "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
		    (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
		    (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
		    (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
		    (uint64_t)VM_KERNEL_ADDRPERM(m));
		return;
	}
	hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);

	switch (pf) {
	case PF_INET:
	case PF_INET6:
		break;
	default:
		return;
	}

	/*
	 * Force partial checksum offload; useful to simulate cases
	 * where the hardware does not support partial checksum offload,
	 * in order to validate correctness throughout the layers above.
	 */
	if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
		uint32_t foff = hwcksum_dbg_partial_rxoff_forced;

		if (foff > (uint32_t)m->m_pkthdr.len) {
			return;
		}

		m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;

		/* Compute 16-bit 1's complement sum from forced offset */
		sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));

		m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
		m->m_pkthdr.csum_rx_val = sum;
		m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);

		hwcksum_dbg_partial_forced++;
		hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
	}

	/*
	 * Partial checksum offload verification (and adjustment);
	 * useful to validate and test cases where the hardware
	 * supports partial checksum offload.
	 */
	if ((m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t rxoff;

		/* Start offset must begin after frame header */
		rxoff = m->m_pkthdr.csum_rx_start;
		if (hlen > rxoff) {
			hwcksum_dbg_bad_rxoff++;
			if (dlil_verbose) {
				DLIL_PRINTF("%s: partial cksum start offset %d "
				    "is less than frame header length %d for "
				    "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
				    (uint64_t)VM_KERNEL_ADDRPERM(m));
			}
			return;
		}
		rxoff -= hlen;

		if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
			/*
			 * Compute the expected 16-bit 1's complement sum;
			 * skip this if we've already computed it above
			 * when partial checksum offload is forced.
			 */
			sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));

			/* Hardware or driver is buggy */
			if (sum != m->m_pkthdr.csum_rx_val) {
				hwcksum_dbg_bad_cksum++;
				if (dlil_verbose) {
					DLIL_PRINTF("%s: bad partial cksum value "
					    "0x%x (expected 0x%x) for mbuf "
					    "0x%llx [rx_start %d]\n",
					    if_name(ifp),
					    m->m_pkthdr.csum_rx_val, sum,
					    (uint64_t)VM_KERNEL_ADDRPERM(m),
					    m->m_pkthdr.csum_rx_start);
				}
				return;
			}
		}
		hwcksum_dbg_verified++;

		/*
		 * This code allows us to emulate various hardwares that
		 * perform 16-bit 1's complement sum beginning at various
		 * start offset values.
		 */
		if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
			uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;

			if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
				return;
			}

			/* re-base the sum to the adjusted start offset */
			sum = m_adj_sum16(m, rxoff, aoff,
			    m_pktlen(m) - aoff, sum);

			m->m_pkthdr.csum_rx_val = sum;
			m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);

			hwcksum_dbg_adjusted++;
		}
	}
}
11786
11787 #if DEBUG || DEVELOPMENT
/*
 * Blob for sum16 verification: arbitrary opaque test bytes (the 0x1f
 * 0x8b leading bytes suggest gzip data, but the content is irrelevant)
 * used as the reference input for the checksum self-tests below.
 */
static uint8_t sumdata[] = {
	0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
	0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
	0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
	0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
	0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
	0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
	0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
	0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
	0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
	0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
	0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
	0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
	0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
	0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
	0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
	0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
	0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
	0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
	0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
	0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
	0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
	0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
	0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
	0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
	0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
	0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
	0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
	0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
	0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
	0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
	0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
	0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
	0xc8, 0x28, 0x02, 0x00, 0x00
};
11824
/* Precomputed 16-bit 1's complement sums for various spans of the above data */
static struct {
	boolean_t init;         /* TRUE once sumr has been computed at runtime */
	uint16_t len;           /* number of bytes of sumdata covered */
	uint16_t sumr;          /* reference, computed via in_cksum_mbuf_ref() */
	uint16_t sumrp;         /* reference, precomputed */
} sumtbl[] = {
	{ FALSE, 0, 0, 0x0000 },
	{ FALSE, 1, 0, 0x001f },
	{ FALSE, 2, 0, 0x8b1f },
	{ FALSE, 3, 0, 0x8b27 },
	{ FALSE, 7, 0, 0x790e },
	{ FALSE, 11, 0, 0xcb6d },
	{ FALSE, 20, 0, 0x20dd },
	{ FALSE, 27, 0, 0xbabd },
	{ FALSE, 32, 0, 0xf3e8 },
	{ FALSE, 37, 0, 0x197d },
	{ FALSE, 43, 0, 0x9eae },
	{ FALSE, 64, 0, 0x4678 },
	{ FALSE, 127, 0, 0x9399 },
	{ FALSE, 256, 0, 0xd147 },
	{ FALSE, 325, 0, 0x0358 },
};
/* number of entries in sumtbl */
#define SUMTBL_MAX ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
11849
/*
 * Boot-time self-test for the 16-bit 1's complement sum routines.  For
 * every entry in sumtbl, and for every byte alignment within a
 * uint64_t, the sum over the test blob is computed with m_sum16()
 * (both by shifting the data pointer and by passing an offset) and,
 * under INET, with b_sum16(), and compared against the runtime
 * reference in_cksum_mbuf_ref() and the precomputed value.  Any
 * mismatch panics the system.
 */
static void
dlil_verify_sum16(void)
{
	struct mbuf *m;
	uint8_t *buf;
	int n;

	/* Make sure test data plus extra room for alignment fits in cluster */
	_CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);

	kprintf("DLIL: running SUM16 self-tests ... ");

	m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
	m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));

	buf = mtod(m, uint8_t *);       /* base address */

	for (n = 0; n < SUMTBL_MAX; n++) {
		uint16_t len = sumtbl[n].len;
		int i;

		/* Verify for all possible alignments */
		for (i = 0; i < (int)sizeof(uint64_t); i++) {
			uint16_t sum, sumr;
			uint8_t *c;

			/* Copy over test data to mbuf */
			VERIFY(len <= sizeof(sumdata));
			c = buf + i;
			bcopy(sumdata, c, len);

			/* Zero-offset test (align by data pointer) */
			m->m_data = (uintptr_t)c;
			m->m_len = len;
			sum = m_sum16(m, 0, len);

			if (!sumtbl[n].init) {
				/* lazily compute the runtime reference once per entry */
				sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
				sumtbl[n].sumr = sumr;
				sumtbl[n].init = TRUE;
			} else {
				sumr = sumtbl[n].sumr;
			}

			/* Something is horribly broken; stop now */
			if (sumr != sumtbl[n].sumrp) {
				panic_plain("\n%s: broken in_cksum_mbuf_ref() "
				    "for len=%d align=%d sum=0x%04x "
				    "[expected=0x%04x]\n", __func__,
				    len, i, sum, sumr);
				/* NOTREACHED */
			} else if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}

			/* Alignment test by offset (fixed data pointer) */
			m->m_data = (uintptr_t)buf;
			m->m_len = i + len;
			sum = m_sum16(m, i, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken m_sum16() for len=%d "
				    "offset=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#if INET
			/* Simple sum16 contiguous buffer test by aligment */
			sum = b_sum16(c, len);

			/* Something is horribly broken; stop now */
			if (sum != sumr) {
				panic_plain("\n%s: broken b_sum16() for len=%d "
				    "align=%d sum=0x%04x [expected=0x%04x]\n",
				    __func__, len, i, sum, sumr);
				/* NOTREACHED */
			}
#endif /* INET */
		}
	}
	m_freem(m);

	kprintf("PASSED\n");
}
11938 #endif /* DEBUG || DEVELOPMENT */
11939
11940 #define CASE_STRINGIFY(x) case x: return #x
11941
11942 __private_extern__ const char *
dlil_kev_dl_code_str(u_int32_t event_code)11943 dlil_kev_dl_code_str(u_int32_t event_code)
11944 {
11945 switch (event_code) {
11946 CASE_STRINGIFY(KEV_DL_SIFFLAGS);
11947 CASE_STRINGIFY(KEV_DL_SIFMETRICS);
11948 CASE_STRINGIFY(KEV_DL_SIFMTU);
11949 CASE_STRINGIFY(KEV_DL_SIFPHYS);
11950 CASE_STRINGIFY(KEV_DL_SIFMEDIA);
11951 CASE_STRINGIFY(KEV_DL_SIFGENERIC);
11952 CASE_STRINGIFY(KEV_DL_ADDMULTI);
11953 CASE_STRINGIFY(KEV_DL_DELMULTI);
11954 CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
11955 CASE_STRINGIFY(KEV_DL_IF_DETACHING);
11956 CASE_STRINGIFY(KEV_DL_IF_DETACHED);
11957 CASE_STRINGIFY(KEV_DL_LINK_OFF);
11958 CASE_STRINGIFY(KEV_DL_LINK_ON);
11959 CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
11960 CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
11961 CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
11962 CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
11963 CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
11964 CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
11965 CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
11966 CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
11967 CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
11968 CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
11969 CASE_STRINGIFY(KEV_DL_ISSUES);
11970 CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
11971 default:
11972 break;
11973 }
11974 return "";
11975 }
11976
11977 static void
dlil_dt_tcall_fn(thread_call_param_t arg0,thread_call_param_t arg1)11978 dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
11979 {
11980 #pragma unused(arg1)
11981 struct ifnet *ifp = arg0;
11982
11983 if (ifnet_is_attached(ifp, 1)) {
11984 nstat_ifnet_threshold_reached(ifp->if_index);
11985 ifnet_decr_iorefcnt(ifp);
11986 }
11987 }
11988
/*
 * Check whether the interface's combined I/O byte count has advanced
 * past if_data_threshold since the last notification and, if so, arm
 * the interface's data-threshold thread call to notify
 * NetworkStatistics.  Rate-limited by threshold_interval.
 */
void
ifnet_notify_data_threshold(struct ifnet *ifp)
{
	/* Total traffic: input plus output byte counters */
	uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
	/* Byte count recorded at the last notification */
	uint64_t oldbytes = ifp->if_dt_bytes;

	ASSERT(ifp->if_dt_tcall != NULL);

	/*
	 * If we went over the threshold, notify NetworkStatistics.
	 * We rate-limit it based on the threshold interval value.
	 *
	 * The CAS on if_dt_bytes lets only one concurrent caller win a
	 * given threshold crossing; thread_call_isactive() then skips
	 * re-arming while a callout is already pending or running.
	 * Order matters: the CAS must not run unless the threshold
	 * check passed (short-circuit evaluation).
	 */
	if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
	    OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
		uint64_t tival = (threshold_interval * NSEC_PER_SEC);
		uint64_t now = mach_absolute_time(), deadline = now;
		uint64_t ival;

		if (tival != 0) {
			/* Rate-limited: defer to the next interval boundary */
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
			    deadline);
		} else {
			/* No interval configured: notify immediately */
			(void) thread_call_enter(ifp->if_dt_tcall);
		}
	}
}
12018
12019
/*
 * Forward per-flow interface statistics to the TCP layer; thin
 * wrapper around tcp_update_stats_per_flow().
 */
void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}
12026
/*
 * Atomically OR `set_flags' into `*flags_p'.  Returns the value
 * reported by OSBitOrAtomic (per the libkern contract, the flags
 * word prior to the update).
 */
static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}
12032
/*
 * Atomically clear `clear_flags' in `*flags_p' (AND with the
 * complement).
 */
static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}
12038
12039 __private_extern__ u_int32_t
if_set_eflags(ifnet_t interface,u_int32_t set_flags)12040 if_set_eflags(ifnet_t interface, u_int32_t set_flags)
12041 {
12042 return _set_flags(&interface->if_eflags, set_flags);
12043 }
12044
12045 __private_extern__ void
if_clear_eflags(ifnet_t interface,u_int32_t clear_flags)12046 if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
12047 {
12048 _clear_flags(&interface->if_eflags, clear_flags);
12049 }
12050
12051 __private_extern__ u_int32_t
if_set_xflags(ifnet_t interface,u_int32_t set_flags)12052 if_set_xflags(ifnet_t interface, u_int32_t set_flags)
12053 {
12054 return _set_flags(&interface->if_xflags, set_flags);
12055 }
12056
12057 __private_extern__ void
if_clear_xflags(ifnet_t interface,u_int32_t clear_flags)12058 if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
12059 {
12060 _clear_flags(&interface->if_xflags, clear_flags);
12061 }
12062
/*
 * Bump the interface's traffic-rule generation counter (relaxed
 * ordering) so readers using ifnet_sync_traffic_rule_genid() can
 * detect that the rules changed.
 */
__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}
12068
12069 __private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp,uint32_t * genid)12070 ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
12071 {
12072 if (*genid != ifp->if_traffic_rule_genid) {
12073 *genid = ifp->if_traffic_rule_genid;
12074 return TRUE;
12075 }
12076 return FALSE;
12077 }
/*
 * Publish a new traffic-rule count (release ordering) and bump the
 * generation counter so observers notice the change.
 */
__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
12084
12085 static void
log_hexdump(void * data,size_t len)12086 log_hexdump(void *data, size_t len)
12087 {
12088 size_t i, j, k;
12089 unsigned char *ptr = (unsigned char *)data;
12090 #define MAX_DUMP_BUF 32
12091 unsigned char buf[3 * MAX_DUMP_BUF + 1];
12092
12093 for (i = 0; i < len; i += MAX_DUMP_BUF) {
12094 for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
12095 unsigned char msnbl = ptr[j] >> 4;
12096 unsigned char lsnbl = ptr[j] & 0x0f;
12097
12098 buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
12099 buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;
12100
12101 if ((j % 2) == 1) {
12102 buf[k++] = ' ';
12103 }
12104 if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
12105 buf[k++] = ' ';
12106 }
12107 }
12108 buf[k] = 0;
12109 os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
12110 }
12111 }
12112
#if SKYWALK
/*
 * Report whether only OS (compatible) interface filters are present.
 * With ifp == NULL the global filter counters are consulted;
 * otherwise the per-interface count of non-OS filters is checked.
 */
static bool
net_check_compatible_if_filter(struct ifnet *ifp)
{
	if (ifp != NULL) {
		/* Compatible only if no non-OS filters on this interface */
		return ifp->if_flt_non_os_count == 0;
	}
	/* Global check: every attached filter must be an OS filter */
	return net_api_stats.nas_iflt_attach_count <=
	    net_api_stats.nas_iflt_attach_os_count;
}
#endif /* SKYWALK */
12129
/*
 * Advance the output cursor by `k' bytes and jump to `done' when the
 * buffer is exhausted.  Relies on locals `c', `k', `clen' and a
 * `done' label in the enclosing function.
 */
#define DUMP_BUF_CHK() { \
	clen -= k; \
	if (clen < 1) \
		goto done; \
	c += k; \
}

int dlil_dump_top_if_qlen(char *, int);
/*
 * Debug helper: scan all interfaces and write a short summary of the
 * interface with the deepest send (ifcq) queue and the interface with
 * the deepest DLIL input queue into `str' (capacity `str_len').
 * Returns the number of bytes written.
 */
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;                  /* output cursor */
	int k, clen = str_len;          /* k: last write size; clen: space left */
	struct ifnet *top_ifcq_ifp = NULL;      /* deepest send queue */
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;       /* deepest input queue */
	uint32_t top_inq_len = 0;

	/*
	 * NOTE(review): iterates ifindex2ifnet without any visible ifnet
	 * lock/reference, and the `ifidx < if_index' bound skips index
	 * if_index itself -- presumably acceptable for this best-effort
	 * debug path; confirm both are intentional.
	 */
	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		/* Track the interface with the longest send queue */
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		/* Track the interface with the longest DLIL input queue */
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
12178