/*
 * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
 * All rights reserved.
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#ifndef _SKYWALK_CHANNEL_CHANNELVAR_H_
#define _SKYWALK_CHANNEL_CHANNELVAR_H_

#ifdef BSD_KERNEL_PRIVATE
#include <skywalk/core/skywalk_var.h>
#include <skywalk/os_channel_private.h>
#include <skywalk/nexus/nexus_mbq.h>
#include <skywalk/nexus/nexus_pktq.h>
#include <skywalk/mem/skmem_region_var.h>
#include <skywalk/mem/skmem_arena_var.h>

struct ch_selinfo {
	decl_lck_mtx_data(, csi_lock);
	struct selinfo	csi_si;
	uint32_t	csi_flags;
	uint32_t	csi_pending;
	uint64_t	csi_eff_interval;
	uint64_t	csi_interval;
	thread_call_t	csi_tcall;
};

/* values for csi_flags */
#define CSI_KNOTE	0x1		/* kernel note attached */
#define CSI_MITIGATION	0x10		/* has mitigation */
#define CSI_DESTROYED	(1U << 31)	/* has been destroyed */

#define CSI_LOCK(_csi)			\
	lck_mtx_lock(&(_csi)->csi_lock)
#define CSI_LOCK_ASSERT_HELD(_csi)	\
	LCK_MTX_ASSERT(&(_csi)->csi_lock, LCK_MTX_ASSERT_OWNED)
#define CSI_LOCK_ASSERT_NOTHELD(_csi)	\
	LCK_MTX_ASSERT(&(_csi)->csi_lock, LCK_MTX_ASSERT_NOTOWNED)
#define CSI_UNLOCK(_csi)		\
	lck_mtx_unlock(&(_csi)->csi_lock)

/* mitigation intervals in ns */
#define CH_MIT_IVAL_DEFAULT	(0)
#define CH_MIT_IVAL_WIFI	CH_MIT_IVAL_DEFAULT
#define CH_MIT_IVAL_CELLULAR	CH_MIT_IVAL_DEFAULT
#define CH_MIT_IVAL_ETHERNET	CH_MIT_IVAL_DEFAULT
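/*
 * Illustrative sketch (not compiled): how a notifier might consult the
 * mitigation state above.  The helper name and the exact wakeup path are
 * assumptions for illustration; the authoritative logic lives in the
 * csi_* routines in channel.c.
 */
#if 0
static void
csi_note_event_sketch(struct ch_selinfo *csi, uint32_t hint)
{
	uint64_t deadline;

	CSI_LOCK(csi);
	csi->csi_pending |= hint;	/* coalesce with earlier events */
	if ((csi->csi_flags & CSI_MITIGATION) != 0) {
		/* mitigation armed: defer the wakeup by csi_eff_interval ns */
		clock_interval_to_deadline((uint32_t)csi->csi_eff_interval,
		    1, &deadline);
		thread_call_enter_delayed(csi->csi_tcall, deadline);
	} else {
		selwakeup(&csi->csi_si);	/* immediate wakeup */
	}
	CSI_UNLOCK(csi);
}
#endif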

/*
 * Kernel version of __user_slot_desc.
 *
 * Keep slot descriptor as minimal as possible.
 * TODO: [email protected] -- Should we make use of RX/TX
 * preparation/writeback descriptors (in a union)?
 */
struct __kern_slot_desc {
	union {
		struct __kern_quantum *sd_qum;
		struct __kern_packet *sd_pkt;
		struct __kern_buflet *sd_buf;
		void *sd_md;		/* metadata address */
	};

#ifndef __LP64__
	uint32_t _sd_pad[1];
#endif /* !__LP64__ */
};

/* _sd_{user,kern} are at same offset in the preamble */
#define SLOT_DESC_KSD(_sdp)	\
	__unsafe_forge_single(struct __kern_slot_desc *,	\
	((struct __kern_slot_desc *)((uintptr_t)&(_sdp)->_sd_private)))

/*
 * Optional, per-slot context information. An array of these structures
 * is allocated per nexus_adapter, and each real kring will have its slots
 * correspond to one. The 'arg' value is supplied through the slot_init
 * nexus provider callback, and is subsequently retrievable via calls
 * to kern_channel_slot_get_context().
 */
struct slot_ctx {
	/* -fbounds-safety: No one really uses this, so don't annotate it yet */
	void *slot_ctx_arg;	/* per-slot context */
};
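
/*
 * Illustrative sketch (not compiled): a provider's slot_init callback
 * hands back the per-slot 'arg', which can later be recovered from a
 * slot via kern_channel_slot_get_context().  The callback signature is
 * abbreviated/assumed here; see the nexus provider KPI headers for the
 * real one, and my_dma_descs is a hypothetical driver-private array.
 */
#if 0
static errno_t
my_slot_init_sketch(kern_channel_ring_t ring, kern_channel_slot_t slot,
    uint32_t slot_index, void **slot_ctx_arg)
{
	/* e.g. point each slot at a driver-private DMA descriptor */
	*slot_ctx_arg = &my_dma_descs[slot_index];
	return 0;
}

/* later, e.g. from a sync callback: */
void *arg = kern_channel_slot_get_context(ring, slot);
#endif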

extern lck_attr_t channel_lock_attr;
extern uint64_t __ch_umd_redzone_cookie;
extern uint32_t kr_stat_enable;

struct kern_nexus;
enum na_sync_mode;

struct kern_channel {
	decl_lck_mtx_data(, ch_lock);
	struct nexus_adapter *ch_na;
	struct kern_nexus *ch_nexus;
	struct ch_info *ch_info;
	struct kern_pbufpool *ch_pp;

	uint32_t ch_refcnt;
	volatile uint32_t ch_flags;	/* CHANF_* flags */

	/* range of tx/rx/allocator/event rings to scan */
	ring_id_t ch_first[NR_ALL];
	ring_id_t ch_last[NR_ALL];

	struct __user_channel_schema *ch_schema;

	/*
	 * Pointers to the selinfo to be used for selrecord.
	 * Either the local or the global one depending on the
	 * number of rings.
	 */
	struct ch_selinfo *ch_si[NR_ALL];

	STAILQ_ENTRY(kern_channel) ch_link;
	STAILQ_ENTRY(kern_channel) ch_link_if_adv;
	void *ch_ctx;
	mach_vm_offset_t ch_schema_offset;
	struct skmem_arena_mmap_info ch_mmap;
	int ch_fd;		/* might be -1 if no fd */
	pid_t ch_pid;		/* process ID */
	char ch_name[32];	/* process name */
};

/* valid values for ch_flags */
#define CHANF_ATTACHED		0x1	/* attached and connected to nexus */
#define CHANF_PLATFORM		0x2	/* platform binary process */
#define CHANF_KERNEL		0x4	/* kernel only; has no task map */
#define CHANF_RXONLY		0x8	/* receive only, no transmit */
#define CHANF_USER_PACKET_POOL	0x10	/* userspace using packet pool */
#define CHANF_EXCLUSIVE		0x20	/* exclusive bind to ring(s) */
#define CHANF_NONXREF		0x40	/* has no nexus reference */
#define CHANF_HOST		0x80	/* opened to host (kernel) stack */
#define CHANF_EXT_SKIP		0x100	/* don't notify external provider */
#define CHANF_EXT_PRECONNECT	0x200	/* successful nxpi_pre_connect() */
#define CHANF_EXT_CONNECTED	0x400	/* successful nxpi_connected() */
#define CHANF_EVENT_RING	0x1000	/* channel has event rings */
#define CHANF_IF_ADV		0x2000	/* interface advisory is active */
#define CHANF_DEFUNCT_SKIP	0x4000	/* defunct skipped due to active use */
#define CHANF_CLOSING		(1U << 30) /* channel is being closed */
#define CHANF_DEFUNCT		(1U << 31) /* channel is now defunct */

#define CHANF_BITS	\
	"\020\01ATTACHED\02PLATFORM\03KERNEL\04RXONLY\05USER_PKT_POOL"	\
	"\06EXCLUSIVE\07NONXREF\010HOST\011EXT_SKIP\012EXT_PRECONNECT"	\
	"\013EXT_CONNECTED\015EVENT\016ADVISORY"	\
	"\017DEFUNCT_SKIP\037CLOSING\040DEFUNCT"

/* valid values for ch_kevhints */
#define CHAN_FILT_HINT_FLOW_ADV_UPD	0x1	/* flow advisory update */
#define CHAN_FILT_HINT_CHANNEL_EVENT	0x2	/* channel event */
#define CHAN_FILT_HINT_IF_ADV_UPD	0x4	/* interface advisory update */

#define CHAN_FILT_HINT_BITS	"\020\01FLOW_ADV\02CHANNEL_EVENT\03IF_ADV"

typedef enum {
	RING_SET_ALL = 0,		/* all rings */
	RING_SET_DEFAULT = RING_SET_ALL,
} ring_set_t;

typedef enum {
	CH_ENDPOINT_NULL = 0,
	CH_ENDPOINT_USER_PIPE_MASTER,
	CH_ENDPOINT_USER_PIPE_SLAVE,
	CH_ENDPOINT_KERNEL_PIPE,
	CH_ENDPOINT_NET_IF,
	CH_ENDPOINT_FLOW_SWITCH,
} ch_endpoint_t;

#define CHREQ_NAMELEN	64

struct chreq {
	char cr_name[CHREQ_NAMELEN];		/* in */
	uuid_t cr_spec_uuid;			/* in */
	struct ch_ev_thresh cr_tx_lowat;	/* in */
	struct ch_ev_thresh cr_rx_lowat;	/* in */
	nexus_port_t cr_port;			/* in/out */
	uint32_t cr_mode;			/* in */
	uint32_t cr_pipe_id;			/* in */
	ring_id_t cr_ring_id;			/* in */
	ring_set_t cr_ring_set;			/* out */
	ch_endpoint_t cr_real_endpoint;		/* out */
	ch_endpoint_t cr_endpoint;		/* out */
	mach_vm_size_t cr_memsize;		/* out */
	mach_vm_offset_t cr_memoffset;		/* out */
};

/*
 * Private, kernel view of a ring. Keeps track of the status of
 * a ring across system calls.
 *
 *	ckr_khead	Index of the next buffer to refill. It corresponds
 *			to ring_head at the time the system call returns.
 *
 *	ckr_ktail	Index of the first buffer owned by the kernel.
 *
 *			On RX, ckr_khead to ckr_ktail are receive buffers
 *			that are not yet released. ckr_khead is advanced
 *			following ring_head, ckr_ktail is advanced on
 *			incoming packets.
 *
 *			On TX, ckr_khead to ckr_rhead have been filled by
 *			the sender but not sent yet to the destination;
 *			ckr_rhead to ckr_ktail are available for new
 *			transmissions, and ckr_ktail to ckr_khead-1 are
 *			pending transmissions.
 *
 * Here is the layout for the RX and TX rings.
 *
 *           RX RING                          TX RING
 *
 *       +-----------------+            +-----------------+
 *       |                 |            |                 |
 *       |XXX free slot XXX|            |XXX free slot XXX|
 *       +-----------------+            +-----------------+
 * head->| owned by user   |<-khead     | not sent to nic |<-khead
 *       |                 |            | yet             |
 *       |                 |            |                 |
 *       +-----------------+            +      ------     +
 * tail->|                 |<-ktail     |                 |<-klease
 *       | (being          | ...        |                 | ...
 *       |  prepared)      | ...        |                 | ...
 *       +-----------------+ ...        |                 | ...
 *       |                 |<-klease    +-----------------+
 *       |                 |      tail->|                 |<-ktail
 *       |                 |            |                 |
 *       |                 |            |                 |
 *       |                 |            |                 |
 *       +-----------------+            +-----------------+
 *
 * The head/tail (user view) and khead/ktail (kernel view)
 * are used in the normal operation of the adapter.
 *
 * For flow switch nexus:
 *
 * Concurrent rxsync or txsync on the same ring are prevented by
 * na_kr_(try)get(), which in turn uses ckr_busy. This is all we need
 * for NIC rings, and for TX rings attached to the host stack.
 *
 * RX rings attached to the host stack use an nx_mbq (ckr_rx_queue) on both
 * nx_netif_rxsync_from_host() and nx_netif_compat_transmit(). The nx_mbq is
 * protected by its internal lock.
 *
 * RX rings attached to the flow switch are accessed by both senders
 * and receiver. They are protected through the q_lock on the RX ring.
 *
 * When a ring is the output of a switch port (RX ring for a flow switch
 * port, TX ring for the host stack or NIC), slots are reserved in blocks
 * through ckr_klease which points to the next unused slot.
 *
 * On an RX ring, ckr_klease is always after ckr_ktail, and completions cause
 * ckr_ktail to advance. On a TX ring, ckr_klease is always between ckr_khead
 * and ckr_ktail, and completions cause ckr_khead to advance.
 *
 * nx_fsw_vp_na_kr_space() returns the maximum number of slots that
 * can be assigned.
 *
 * nx_fsw_vp_na_kr_lease() reserves the required number of buffers,
 * advances ckr_klease and also returns an entry in a circular
 * array where completions should be reported.
 *
 * For netif nexus:
 *
 * The indexes in the NIC and channel rings are offset by ckr_hwofs slots.
 * This is so that, on a reset, buffers owned by userspace are not modified
 * by the kernel. In particular:
 *
 * RX rings: the next empty buffer (ckr_ktail + ckr_hwofs) coincides with
 *	     the next empty buffer as known by the hardware ("next to check").
 * TX rings: ckr_khead + ckr_hwofs coincides with "next to send".
 */
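
/*
 * A worked example of the circular index arithmetic described above
 * (illustrative sketch, not compiled; numbers are arbitrary):
 */
#if 0
static inline uint32_t
ring_space_example(void)
{
	uint32_t num_slots = 8, lim = 7;	/* lim == num_slots - 1 */
	uint32_t khead = 6, ktail = 2;
	int busy;

	busy = (int)(ktail - khead);	/* 2 - 6 = -4: index wrapped */
	if (busy < 0)
		busy += num_slots;	/* -4 + 8 = 4 slots in flight */

	/* one slot stays unused so khead == ktail always means "empty" */
	return lim - (uint32_t)busy;	/* 7 - 4 = 3 slots free */
}
#endif
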
typedef int (*channel_ring_notify_t)(struct __kern_channel_ring *,
    struct proc *, uint32_t);

struct __kern_channel_ring {
	struct __user_channel_ring *ckr_ring;

	uint32_t ckr_flags;		/* CKRF_* flags */
	slot_idx_t ckr_num_slots;	/* # of slots */
	uint32_t ckr_max_pkt_len;	/* max pp pkt size */
	uint32_t ckr_largest;		/* largest packet seen */
	const slot_idx_t ckr_lim;	/* ckr_num_slots - 1 */
	enum txrx ckr_tx;		/* kind of ring (tx/rx/alloc/free) */

	volatile slot_idx_t ckr_khead;
	volatile slot_idx_t ckr_ktail;
	/*
	 * value of ckr_khead recorded at TX prologue (pre-sync)
	 */
	volatile slot_idx_t ckr_khead_pre;
	/*
	 * Copies of values in user rings, so we do not need to look
	 * at the ring (which could be modified). These are set in the
	 * *sync_prologue()/finalize() routines.
	 */
	volatile slot_idx_t ckr_rhead;
	volatile slot_idx_t ckr_rtail;

	/* EWMA decay rate */
	uint32_t ckr_transfer_decay;

	uint64_t ckr_ready_bytes;
	uint64_t ckr_ready_slots;

	/*
	 * While ckr_state is set, no new [tr]xsync operations can be
	 * started on this kring. This is used by na_disable_all_rings()
	 * to find a synchronization point where critical data structures
	 * pointed to by the kring can be added or removed.
	 */
	decl_lck_spin_data(, ckr_slock);
	struct thread *ckr_owner;	/* busy owner */
	uint32_t ckr_busy;		/* prevent kring modifications */
	uint32_t ckr_want;		/* # of threads that lost the race */
	uint32_t ckr_state;		/* KR_* states */

	/* current working set for the allocator ring */
	volatile uint32_t ckr_alloc_ws;

	struct nexus_adapter *ckr_na;	/* adapter this kring belongs to */
	struct kern_pbufpool *ckr_pp;	/* adapter's packet buffer pool */

	/*
	 * Array of __slot_desc each representing slot-specific data, e.g.
	 * index to metadata, etc. There is exactly one descriptor for each
	 * slot in the ring. Note that the size of the array may be greater
	 * than the number of slots for this ring, and so we constrain
	 * range with [ckr_ksds, ckr_ksds_last] during validations.
	 */
	struct __slot_desc *__counted_by(ckr_usds_cnt) ckr_usds;
					/* slot desc array (user) */
	slot_idx_t ckr_usds_cnt;
	struct __slot_desc *__counted_by(ckr_ksds_cnt) ckr_ksds;
					/* slot desc array (kernel) */
	slot_idx_t ckr_ksds_cnt;
	struct __slot_desc *ckr_ksds_last;	/* cache last ksd */
	struct skmem_cache *ckr_ksds_cache;	/* owning skmem_cache for ksd */

	uint32_t ckr_ring_id;		/* ring ID */

	boolean_t ckr_rate_limited;	/* ring is rate limited */

	/*
	 * Array of packet handles for as many slots as there are in the
	 * ring; this is useful for storing an array of kern_packet_t to
	 * be used when invoking the packet APIs. Only safe to be used
	 * in the context of a sync as we're single-threaded then.
	 * The memory is owned by the nexus adapter.
	 */
	uint64_t *__counted_by(ckr_scratch_cnt) ckr_scratch;
	slot_idx_t ckr_scratch_cnt;
	/*
	 * [tx]sync callback for this kring. The default na_kring_create
	 * callback (na_kr_create) sets the ckr_na_sync callback of each
	 * tx(rx) kring to the corresponding na_txsync(na_rxsync) taken
	 * from the nexus_adapter.
	 *
	 * Overrides: the above configuration is not changed by
	 * any of the nm_krings_create callbacks.
	 */
	int (*ckr_na_sync)(struct __kern_channel_ring *,
	    struct proc *, uint32_t);
	int (*volatile ckr_na_notify)(struct __kern_channel_ring *,
	    struct proc *, uint32_t);

	int (*ckr_prologue)(struct kern_channel *,
	    struct __kern_channel_ring *, const slot_idx_t,
	    uint32_t *, uint64_t *, struct proc *);
	void (*ckr_finalize)(struct kern_channel *,
	    struct __kern_channel_ring *, const slot_idx_t, struct proc *);

	/* time of last channel sync (updated at sync prologue time) */
	uint64_t ckr_sync_time;

#if CONFIG_NEXUS_FLOWSWITCH
	/* The following fields are for flow switch support */
	int (*ckr_save_notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
#endif /* CONFIG_NEXUS_FLOWSWITCH */

	kern_packet_svc_class_t ckr_svc;

	/*
	 * (Optional) array of slot contexts for as many slots as there
	 * are in the ring; the memory is owned by the nexus adapter.
	 */
	uint32_t ckr_slot_ctxs_set;	/* number of valid/set contexts */
	struct slot_ctx *__counted_by(ckr_slot_ctxs_cnt) ckr_slot_ctxs;
					/* (optional) array of slot contexts */
	uint32_t ckr_slot_ctxs_cnt;
	void *ckr_ctx;			/* ring context */

	struct ch_selinfo ckr_si;	/* per-ring wait queue */

#if CONFIG_NEXUS_NETIF
	/*
	 * netif adapters intercept ckr_na_notify in order to
	 * mitigate IRQ events; the actual notification is done
	 * by invoking the original notify callback routine
	 * saved at na_activate() time.
	 */
	int (*ckr_netif_notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	void (*ckr_netif_mit_stats)(struct __kern_channel_ring *kring,
	    uint64_t, uint64_t);
	struct nx_netif_mit *ckr_mit;

	volatile uint32_t ckr_pending_intr;
	volatile uint32_t ckr_pending_doorbell;

	/*
	 * Support for adapters without native Skywalk support.
	 * On tx rings we preallocate an array of tx buffers
	 * (same size as the channel ring), on rx rings we
	 * store incoming mbufs in a queue that is drained by
	 * a rxsync.
	 */
	struct mbuf **__counted_by(ckr_tx_pool_count) ckr_tx_pool;
	uint32_t ckr_tx_pool_count;
	struct nx_mbq ckr_rx_queue;	/* intercepted rx mbufs */
#endif /* CONFIG_NEXUS_NETIF */

#if CONFIG_NEXUS_USER_PIPE
	/* if this is a pipe ring, pointer to the other end */
	struct __kern_channel_ring *ckr_pipe;
	/* pointer to hidden rings (see nx_user_pipe.c for details) */
	struct __user_channel_ring *ckr_save_ring;
#endif /* CONFIG_NEXUS_USER_PIPE */

	/*
	 * Protects kring in the event of multiple writers;
	 * only used by flow switch and monitor.
	 */
	decl_lck_mtx_data(, ckr_qlock);

#if CONFIG_NEXUS_MONITOR
	/* array of krings that are monitoring this kring */
	struct __kern_channel_ring **ckr_monitors;
	uint32_t ckr_max_monitors; /* current size of the monitors array */
	uint32_t ckr_n_monitors; /* next unused entry in the monitor array */
	/*
	 * Monitors work by intercepting the sync and notify callbacks of the
	 * monitored krings. This is implemented by replacing the pointers
	 * above and saving the previous ones in mon_* pointers below.
	 */
	int (*ckr_mon_sync)(struct __kern_channel_ring *kring, struct proc *,
	    uint32_t flags);
	int (*ckr_mon_notify)(struct __kern_channel_ring *kring, struct proc *,
	    uint32_t flags);

	uint32_t ckr_mon_tail;	/* last seen slot on rx */
	/* index of this ring in the monitored ring array */
	uint32_t ckr_mon_pos;
#endif /* CONFIG_NEXUS_MONITOR */

	uint32_t ckr_users;	/* existing bindings for this ring */

	/* ring flush rate limit */
	int64_t ckr_tbr_token;
	int64_t ckr_tbr_depth;
	uint64_t ckr_tbr_last;
#define CKR_TBR_TOKEN_INVALID	INT64_MAX

	/* stats capturing errors */
	channel_ring_error_stats ckr_err_stats
	__attribute__((aligned(sizeof(uint64_t))));

	/* stats capturing actual data movement (nexus provider's view) */
	channel_ring_stats ckr_stats
	__attribute__((aligned(sizeof(uint64_t))));
	uint64_t ckr_accumulated_bytes;
	uint64_t ckr_accumulated_slots;
	uint64_t ckr_accumulate_start;	/* in seconds */

	/* stats capturing user activities per sync (user's view) */
	channel_ring_user_stats ckr_usr_stats
	__attribute__((aligned(sizeof(uint64_t))));
	uint64_t ckr_user_accumulated_bytes;
	uint64_t ckr_user_accumulated_slots;
	uint64_t ckr_user_accumulated_syncs;
	uint64_t ckr_user_accumulate_start;	/* in seconds */

	lck_grp_t *ckr_qlock_group;
	lck_grp_t *ckr_slock_group;

	char ckr_name[64];	/* diagnostic */

	uint64_t ckr_rx_dequeue_ts; /* last timestamp when userspace dequeued */
	uint64_t ckr_rx_enqueue_ts; /* last timestamp when kernel enqueued */
} __attribute__((__aligned__(CHANNEL_CACHE_ALIGN_MAX)));

#define KR_LOCK(_kr)			\
	lck_mtx_lock(&(_kr)->ckr_qlock)
#define KR_LOCK_SPIN(_kr)		\
	lck_mtx_lock_spin(&(_kr)->ckr_qlock)
#define KR_LOCK_TRY(_kr)		\
	lck_mtx_try_lock(&(_kr)->ckr_qlock)
#define KR_LOCK_ASSERT_HELD(_kr)	\
	LCK_MTX_ASSERT(&(_kr)->ckr_qlock, LCK_MTX_ASSERT_OWNED)
#define KR_LOCK_ASSERT_NOTHELD(_kr)	\
	LCK_MTX_ASSERT(&(_kr)->ckr_qlock, LCK_MTX_ASSERT_NOTOWNED)
#define KR_UNLOCK(_kr)			\
	lck_mtx_unlock(&(_kr)->ckr_qlock)

/* valid values for ckr_flags */
#define CKRF_EXCLUSIVE		0x1	/* exclusive binding */
#define CKRF_DROP		0x2	/* drop all mode */
#define CKRF_HOST		0x4	/* host ring */
#define CKRF_MEM_RING_INITED	0x8	/* na_kr_setup() succeeded */
#define CKRF_MEM_SD_INITED	0x10	/* na_kr_setup() succeeded */
#define CKRF_EXT_RING_INITED	0x20	/* nxpi_ring_init() succeeded */
#define CKRF_EXT_SLOTS_INITED	0x40	/* nxpi_slot_init() succeeded */
#define CKRF_SLOT_CONTEXT	0x80	/* ckr_slot_ctxs is valid */
#define CKRF_MITIGATION		0x100	/* supports event mitigation */
#define CKRF_DEFUNCT		0x200	/* no longer in service */
#define CKRF_KERNEL_ONLY	(1U << 31) /* not usable by userland */

#define CKRF_BITS	\
	"\020\01EXCLUSIVE\02DROP\03HOST\04MEM_RING_INITED"	\
	"\05MEM_SD_INITED\06EXT_RING_INITED\07EXT_SLOTS_INITED"	\
	"\010SLOT_CONTEXT\011MITIGATION\012DEFUNCT\040KERNEL_ONLY"

#define KRNA(_kr)	\
	((__DECONST(struct __kern_channel_ring *, _kr))->ckr_na)

#define KR_KERNEL_ONLY(_kr)	\
	(((_kr)->ckr_flags & CKRF_KERNEL_ONLY) != 0)
#define KR_DROP(_kr)	\
	(((_kr)->ckr_flags & (CKRF_DROP|CKRF_DEFUNCT)) != 0)

/* valid values for ckr_state */
enum {
	KR_READY = 0,
	KR_STOPPED,	/* unbounded stop */
	KR_LOCKED,	/* bounded, brief stop for mutual exclusion */
};

#define KR_KSD(_kring, _slot_idx)	\
	(SLOT_DESC_KSD(&(_kring)->ckr_ksds[_slot_idx]))

#define KR_USD(_kring, _slot_idx)	\
	(SLOT_DESC_USD(&(_kring)->ckr_usds[_slot_idx]))

__attribute__((always_inline))
static inline slot_idx_t
KR_SLOT_INDEX(const struct __kern_channel_ring *kr,
    const struct __slot_desc *slot)
{
	ASSERT(slot >= kr->ckr_ksds && slot <= kr->ckr_ksds_last);
	return (slot_idx_t)(slot - kr->ckr_ksds);
}

/* Helper macros for slot descriptor, decoupled for KSD/USD. */

#define KSD_VALID_METADATA(_ksd)	\
	((_ksd)->sd_md != NULL)

#define KSD_INIT(_ksd) do {	\
	(_ksd)->sd_md = NULL;	\
} while (0)

#define KSD_ATTACH_METADATA(_ksd, _md_addr) do {	\
	ASSERT((_ksd) != NULL);				\
	ASSERT((_ksd)->sd_md == NULL);			\
	(_ksd)->sd_md = (_md_addr);			\
} while (0)

#define KSD_DETACH_METADATA(_ksd) do {	\
	ASSERT((_ksd) != NULL);		\
	ASSERT((_ksd)->sd_md != NULL);	\
	(_ksd)->sd_md = NULL;		\
} while (0)

#define KSD_RESET(_ksd)	KSD_INIT(_ksd)

#define USD_INIT(_usd) do {			\
	(_usd)->sd_md_idx = OBJ_IDX_NONE;	\
	(_usd)->sd_flags = 0;			\
	(_usd)->sd_len = 0;			\
} while (0)

#define USD_ATTACH_METADATA(_usd, _md_idx) do {		\
	ASSERT((_usd) != NULL);				\
	ASSERT((_usd)->sd_md_idx == OBJ_IDX_NONE);	\
	ASSERT(((_usd)->sd_flags & SD_IDX_VALID) == 0);	\
	(_usd)->sd_md_idx = (_md_idx);			\
	(_usd)->sd_flags |= SD_IDX_VALID;		\
	/* mask off non-user flags */			\
	(_usd)->sd_flags &= SD_FLAGS_USER;		\
} while (0)

#define USD_DETACH_METADATA(_usd) do {		\
	ASSERT((_usd) != NULL);			\
	(_usd)->sd_md_idx = OBJ_IDX_NONE;	\
	/* mask off non-user flags */		\
	(_usd)->sd_flags &= SD_FLAGS_USER;	\
	(_usd)->sd_flags &= ~SD_IDX_VALID;	\
} while (0)

#define USD_RESET(_usd)	USD_INIT(_usd)

#define USD_SET_LENGTH(_usd, _md_len) do {	\
	ASSERT((_usd) != NULL);			\
	(_usd)->sd_len = _md_len;		\
} while (0)
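
/*
 * Taken together, attaching metadata to a slot updates both views; an
 * illustrative sequence is sketched below (not compiled; error handling
 * and flag details omitted -- the authoritative versions are
 * kr_internalize_metadata() and kr_externalize_metadata(), and the
 * METADATA_IDX()/qum_len names are assumed from the metadata headers).
 */
#if 0
static inline void
sd_attach_example(struct __kern_channel_ring *kring, slot_idx_t idx,
    struct __kern_quantum *kqum)
{
	struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
	struct __user_slot_desc *usd = KR_USD(kring, idx);

	KSD_ATTACH_METADATA(ksd, kqum);			/* kernel view: pointer */
	USD_ATTACH_METADATA(usd, METADATA_IDX(kqum));	/* user view: obj index */
	USD_SET_LENGTH(usd, kqum->qum_len);

	/* ... and the reverse when the slot is reclaimed: */
	USD_DETACH_METADATA(usd);
	KSD_DETACH_METADATA(ksd);
}
#endif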

#define _USD_COPY(_src, _dst) do {					\
	_CASSERT(sizeof (struct __user_slot_desc) == 8);		\
	sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \
} while (0)

#define _USD_SWAP(_usd1, _usd2) do {			\
	struct __user_slot_desc _tusd			\
	    __attribute((aligned(sizeof (uint64_t))));	\
	_USD_COPY(_usd1, &_tusd);			\
	_USD_COPY(_usd2, _usd1);			\
	_USD_COPY(&_tusd, _usd2);			\
} while (0)

#define _KSD_COPY(_src, _dst) do {					\
	_CASSERT(sizeof (struct __kern_slot_desc) == 8);		\
	sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \
} while (0)

#define _KSD_SWAP(_ksd1, _ksd2) do {			\
	struct __kern_slot_desc _tksd			\
	    __attribute((aligned(sizeof (uint64_t))));	\
	_KSD_COPY(_ksd1, &_tksd);			\
	_KSD_COPY(_ksd2, _ksd1);			\
	_KSD_COPY(&_tksd, _ksd2);			\
} while (0)

#define SD_SWAP(_ksd1, _usd1, _ksd2, _usd2) do {			\
	_USD_SWAP(_usd1, _usd2);					\
	_KSD_SWAP(_ksd1, _ksd2);					\
	/* swap packet attachment */					\
	*(struct __kern_slot_desc **)(uintptr_t)&(_ksd1)->sd_qum->qum_ksd = \
	    (_ksd1);							\
	*(struct __kern_slot_desc **)(uintptr_t)&(_ksd2)->sd_qum->qum_ksd = \
	    (_ksd2);							\
} while (0)

#define _MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim) do { \
	struct __kern_quantum *_q = SK_PTR_ADDR_KQUM(_md);		\
	switch (METADATA_TYPE(_q)) {					\
	case NEXUS_META_TYPE_PACKET: {					\
		struct __kern_packet *_p =				\
		    (struct __kern_packet *)(void *)(_md);		\
		struct __kern_buflet *_kbft;				\
		PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft);	\
		(_addr) = __unsafe_forge_bidi_indexable(void *,		\
		    __DECONST(void *, _kbft->buf_addr), _kbft->buf_dlim); \
		(_objaddr) = __unsafe_forge_bidi_indexable(void *,	\
		    _kbft->buf_objaddr, _kbft->buf_dlim);		\
		(_doff) = _kbft->buf_doff;				\
		(_dlen) = _kbft->buf_dlen;				\
		(_dlim) = _kbft->buf_dlim;				\
		break;							\
	}								\
	default:							\
		(_addr) = __unsafe_forge_bidi_indexable(void *,		\
		    __DECONST(void *, _q->qum_buf[0].buf_addr),		\
		    _q->qum_buf[0].buf_dlim);				\
		(_objaddr) = __unsafe_forge_bidi_indexable(void *,	\
		    _q->qum_buf[0].buf_objaddr,				\
		    _q->qum_buf[0].buf_dlim);				\
		(_doff) = _q->qum_buf[0].buf_doff;			\
		(_dlen) = _q->qum_buf[0].buf_dlen;			\
		(_dlim) = _q->qum_buf[0].buf_dlim;			\
		break;							\
	}								\
	ASSERT((_addr) != NULL);					\
	ASSERT((_objaddr) != NULL);					\
} while (0)

#define _MD_BUFLET_ADDR_PKT(_md, _addr) do {				\
	ASSERT(METADATA_TYPE(SK_PTR_ADDR_KQUM(_md)) ==			\
	    NEXUS_META_TYPE_PACKET);					\
	struct __kern_packet *_p = (struct __kern_packet *)(void *)(_md); \
	struct __kern_buflet *_kbft;					\
	PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft);		\
	(_addr) = __unsafe_forge_bidi_indexable(void *,			\
	    __DECONST(void *, _kbft->buf_addr), _kbft->buf_dlim);	\
	ASSERT((_addr) != NULL);					\
} while (0)

/*
 * Return the data offset adjusted virtual address of a buffer associated
 * with the metadata; for metadata with multiple buflets, this is the
 * first buffer's address.
 */
#define MD_BUFLET_ADDR(_md, _val) do {					\
	void *_addr, *_objaddr;						\
	uint32_t _doff, _dlen, _dlim;					\
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim);	\
	/* skip past buflet data offset */				\
	(_val) = (void *)((uint8_t *)_addr + _doff);			\
} while (0)

/*
 * Return the absolute virtual address of a buffer associated with the
 * metadata; for metadata with multiple buflets, this is the first
 * buffer's address.
 */
#define MD_BUFLET_ADDR_ABS(_md, _val) do {				\
	void *_addr, *_objaddr;						\
	uint32_t _doff, _dlen, _dlim;					\
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim);	\
	(_val) = (void *)_addr;						\
} while (0)

/* similar to MD_BUFLET_ADDR_ABS() but optimized only for packets */
#define MD_BUFLET_ADDR_ABS_PKT(_md, _val) do {	\
	void *_addr;				\
	_MD_BUFLET_ADDR_PKT(_md, _addr);	\
	(_val) = (void *)_addr;			\
} while (0)

#define MD_BUFLET_ADDR_ABS_DLEN(_md, _val, _dlen, _dlim, _doff) do {	\
	void *_addr, *_objaddr;						\
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim);	\
	(_val) = (void *)_addr;						\
} while (0)

/*
 * Return the buffer's object address associated with the metadata; for
 * metadata with multiple buflets, this is the first buffer's object address.
 */
#define MD_BUFLET_OBJADDR(_md, _val) do {				\
	void *_addr, *_objaddr;						\
	uint32_t _doff, _dlen, _dlim;					\
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim);	\
	(_val) = (void *)_objaddr;					\
} while (0)

/*
 * Return the data offset adjusted virtual address of a buffer associated
 * with the metadata; for metadata with multiple buflets, this is the
 * first buffer's address and data length.
 */
#define MD_BUFLET_ADDR_DLEN(_md, _val, _dlen) do {			\
	void *_addr, *_objaddr;						\
	uint32_t _doff, _dlim;						\
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim);	\
	/* skip past buflet data offset */				\
	(_val) = (void *)(__unsafe_forge_bidi_indexable(uint8_t *,	\
	    _addr, _dlim) + _doff);					\
} while (0)
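
/*
 * Typical use of the accessors above (illustrative sketch, not compiled;
 * assumes md is a metadata handle as used elsewhere in this header):
 */
#if 0
void *va;
uint32_t len;
uint8_t b = 0;

MD_BUFLET_ADDR_DLEN(md, va, len);	/* va points past the data offset */
if (len > 0)
	b = *(uint8_t *)va;		/* first payload byte */
#endif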

/* kr_space: return available space for enqueue into kring */
__attribute__((always_inline))
static inline uint32_t
kr_available_slots(struct __kern_channel_ring *kr)
{
	int busy;
	uint32_t space;

	/* # of busy (unclaimed) slots */
	busy = (int)(kr->ckr_khead - kr->ckr_ktail);
	if (busy < 0) {
		busy += kr->ckr_num_slots;
	}

	/* # of avail free slots (subtract busy from max) */
	space = kr->ckr_lim - (uint32_t)busy;
	return space;
}

/* kr_space: return available space for enqueue into Rx kring */
__attribute__((always_inline))
static inline uint32_t
kr_available_slots_rxring(struct __kern_channel_ring *rxkring)
{
	int busy;
	uint32_t space;

	/* # of rx busy (unclaimed) slots */
	busy = (int)(rxkring->ckr_ktail - rxkring->ckr_khead);
	if (busy < 0) {
		busy += rxkring->ckr_num_slots;
	}

	/* # of rx avail free slots (subtract busy from max) */
	space = rxkring->ckr_lim - (uint32_t)busy;
	return space;
}
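
/*
 * e.g. a sender into an RX ring would check for space before claiming
 * slots (illustrative sketch, not compiled):
 */
#if 0
static inline int
rx_enqueue_example(struct __kern_channel_ring *rxkring, uint32_t n)
{
	uint32_t space = kr_available_slots_rxring(rxkring);

	if (space == 0)
		return ENOBUFS;	/* ring full; retry after the next rxsync */
	n = MIN(n, space);
	/* fill n slots starting at ckr_ktail, then advance ckr_ktail */
	return 0;
}
#endif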

extern kern_allocation_name_t skmem_tag_ch_key;

#if (DEVELOPMENT || DEBUG)
SYSCTL_DECL(_kern_skywalk_channel);
#endif /* !DEVELOPMENT && !DEBUG */

__BEGIN_DECLS
extern int channel_init(void);
extern void channel_fini(void);

extern struct kern_channel *ch_open(struct ch_init *, struct proc *,
    int, int *);
extern struct kern_channel *ch_open_special(struct kern_nexus *,
    struct chreq *, boolean_t, int *);
extern void ch_close(struct kern_channel *, boolean_t);
extern void ch_close_special(struct kern_channel *);
extern int ch_kqfilter(struct kern_channel *, struct knote *,
    struct kevent_qos_s *kev);
extern boolean_t ch_is_multiplex(struct kern_channel *, enum txrx);
extern int ch_select(struct kern_channel *, int, void *, struct proc *);
extern int ch_get_opt(struct kern_channel *, struct sockopt *);
extern int ch_set_opt(struct kern_channel *, struct sockopt *);
extern void ch_deactivate(struct kern_channel *);
extern void ch_retain(struct kern_channel *);
extern void ch_retain_locked(struct kern_channel *);
extern int ch_release(struct kern_channel *);
extern int ch_release_locked(struct kern_channel *);
extern void ch_dtor(struct kern_channel *);

extern void csi_init(struct ch_selinfo *, boolean_t, uint64_t);
extern void csi_destroy(struct ch_selinfo *);
extern void csi_selrecord_one(struct __kern_channel_ring *, struct proc *,
    void *);
extern void csi_selrecord_all(struct nexus_adapter *, enum txrx, struct proc *,
    void *);
extern void csi_selwakeup_one(struct __kern_channel_ring *, boolean_t,
    boolean_t, boolean_t, uint32_t);
extern void csi_selwakeup_all(struct nexus_adapter *, enum txrx, boolean_t,
    boolean_t, boolean_t, uint32_t);

extern void kr_init_to_mhints(struct __kern_channel_ring *, uint32_t);
extern int kr_enter(struct __kern_channel_ring *, boolean_t);
extern void kr_exit(struct __kern_channel_ring *);
extern void kr_start(struct __kern_channel_ring *);
extern void kr_stop(struct __kern_channel_ring *kr, uint32_t state);
extern void kr_update_stats(struct __kern_channel_ring *kring,
    uint32_t slot_count, uint32_t byte_count);
extern boolean_t kr_txempty(struct __kern_channel_ring *kring);
extern uint32_t kr_reclaim(struct __kern_channel_ring *kr);

extern slot_idx_t kr_txsync_prologue(struct kern_channel *,
    struct __kern_channel_ring *, struct proc *);
extern int kr_txprologue(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);
extern int kr_txprologue_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);

extern void kr_txsync_finalize(struct kern_channel *,
    struct __kern_channel_ring *, struct proc *);
extern void kr_txfinalize(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
extern void kr_txfinalize_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);

extern slot_idx_t kr_rxsync_prologue(struct kern_channel *ch,
    struct __kern_channel_ring *kring, struct proc *p);
extern int kr_rxprologue(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);
extern int kr_rxprologue_nodetach(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);
extern int kr_rxprologue_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);

extern void kr_rxsync_finalize(struct kern_channel *ch,
    struct __kern_channel_ring *kring, struct proc *p);
extern void kr_rxfinalize(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
extern void kr_rxfinalize_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);

extern void kr_txkring_reclaim_and_refill(struct __kern_channel_ring *kring,
    slot_idx_t index);
extern slot_idx_t kr_alloc_sync_prologue(struct __kern_channel_ring *kring,
    struct proc *p);
extern slot_idx_t kr_free_sync_prologue(struct __kern_channel_ring *kring,
    struct proc *p);
extern void kr_alloc_sync_finalize(struct __kern_channel_ring *kring,
    struct proc *p);
extern void kr_free_sync_finalize(struct __kern_channel_ring *kring,
    struct proc *p);
extern int kr_internalize_metadata(struct kern_channel *,
    struct __kern_channel_ring *, const uint32_t, struct __kern_quantum *,
    struct proc *);
extern void kr_externalize_metadata(struct __kern_channel_ring *,
    const uint32_t, struct __kern_quantum *, struct proc *);
extern slot_idx_t kr_event_sync_prologue(struct __kern_channel_ring *kring,
    struct proc *p);
extern void kr_event_sync_finalize(struct kern_channel *ch,
    struct __kern_channel_ring *kring, struct proc *p);

#if SK_LOG
extern void kr_log_bad_ring(struct __kern_channel_ring *);
#else
#define kr_log_bad_ring(_kr)	do { ((void)0); } while (0)
#endif /* SK_LOG */
__END_DECLS
#endif /* BSD_KERNEL_PRIVATE */
#endif /* !_SKYWALK_CHANNEL_CHANNELVAR_H_ */