1 /*
2 * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
31 * All rights reserved.
32 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 */
55
56 #ifndef _SKYWALK_CHANNEL_CHANNELVAR_H_
57 #define _SKYWALK_CHANNEL_CHANNELVAR_H_
58
59 #ifdef BSD_KERNEL_PRIVATE
60 #include <skywalk/skywalk_common.h>
61 #include <skywalk/core/skywalk_var.h>
62 #include <skywalk/os_channel_private.h>
63 #include <skywalk/nexus/nexus_mbq.h>
64 #include <skywalk/nexus/nexus_pktq.h>
65 #include <skywalk/mem/skmem_region_var.h>
66 #include <skywalk/mem/skmem_arena_var.h>
67
/*
 * Per-channel (or per-ring) select/kevent state.  Wraps a selinfo
 * together with optional wakeup-mitigation state; all fields are
 * protected by csi_lock.  See csi_init()/csi_destroy() and the
 * csi_selrecord_*()/csi_selwakeup_*() routines declared below.
 */
struct ch_selinfo {
	decl_lck_mtx_data(, csi_lock);	/* protects fields below */
	struct selinfo csi_si;		/* select/kqueue wait info */
	uint32_t csi_flags;		/* CSI_* flags (see below) */
	uint32_t csi_pending;		/* pending notification state */
	uint64_t csi_eff_interval;	/* effective mitigation interval (ns) */
	uint64_t csi_interval;		/* configured mitigation interval (ns) */
	thread_call_t csi_tcall;	/* deferred wakeup; used w/ CSI_MITIGATION */
};
77
/* values for csi_flags */
#define CSI_KNOTE	0x1		/* kernel note attached */
#define CSI_MITIGATION	0x10		/* has mitigation */
#define CSI_DESTROYED	(1U << 31)	/* has been destroyed */

/* lock/unlock and lock-ownership assertions for a ch_selinfo */
#define CSI_LOCK(_csi)	\
	lck_mtx_lock(&(_csi)->csi_lock)
#define CSI_LOCK_ASSERT_HELD(_csi)	\
	LCK_MTX_ASSERT(&(_csi)->csi_lock, LCK_MTX_ASSERT_OWNED)
#define CSI_LOCK_ASSERT_NOTHELD(_csi)	\
	LCK_MTX_ASSERT(&(_csi)->csi_lock, LCK_MTX_ASSERT_NOTOWNED)
#define CSI_UNLOCK(_csi)	\
	lck_mtx_unlock(&(_csi)->csi_lock)

/* mitigation intervals in ns (all currently 0, i.e. mitigation disabled) */
#define CH_MIT_IVAL_DEFAULT	(0)
#define CH_MIT_IVAL_WIFI	CH_MIT_IVAL_DEFAULT
#define CH_MIT_IVAL_CELLULAR	CH_MIT_IVAL_DEFAULT
#define CH_MIT_IVAL_ETHERNET	CH_MIT_IVAL_DEFAULT
97
/*
 * Kernel version of __user_slot_desc.
 *
 * Keep slot descriptor as minimal as possible.
 * TODO: [email protected] -- Should we make use of RX/TX
 * preparation/writeback descriptors (in a union)?
 */
struct __kern_slot_desc {
	/* all union members alias the same metadata pointer */
	union {
		struct __kern_quantum *sd_qum;	/* quantum view */
		struct __kern_packet *sd_pkt;	/* packet view */
		struct __kern_buflet *sd_buf;	/* buflet view */
		void *sd_md;			/* metadata address */
	};

#ifndef __LP64__
	/* pad to 8 bytes on ILP32; _KSD_COPY static-asserts sizeof == 8 */
	uint32_t _sd_pad[1];
#endif /* !__LP64__ */
};
117
/*
 * Return the kernel view (__kern_slot_desc) of slot descriptor _sdp,
 * which lives in the descriptor's _sd_private preamble area.
 * _sd_{user,kern} are at same offset in the preamble.
 */
#define SLOT_DESC_KSD(_sdp) \
	__unsafe_forge_single(struct __kern_slot_desc *, \
	((struct __kern_slot_desc *)((uintptr_t)&(_sdp)->_sd_private)))
122
/*
 * Optional, per-slot context information.  An array of these structures
 * is allocated per nexus_adapter, and each real kring will have its slots
 * correspond to one.  The 'arg' value is supplied via the slot_init
 * nexus provider callback, and is subsequently retrievable via calls
 * to kern_channel_slot_get_context().
 */
struct slot_ctx {
	/* -fbounds-safety: No one really uses this, so don't annotate it yet */
	void *slot_ctx_arg;	/* per-slot context */
};
134
135 extern lck_attr_t channel_lock_attr;
136 extern uint64_t __ch_umd_redzone_cookie;
137 extern uint32_t kr_stat_enable;
138
139 struct kern_nexus;
140 enum na_sync_mode;
141
/*
 * Kernel channel instance: one open channel (user fd-backed or
 * kernel-only) attached to a nexus adapter.  Created by ch_open()/
 * ch_open_special(); refcounted via ch_retain()/ch_release().
 */
struct kern_channel {
	decl_lck_mtx_data(, ch_lock);		/* channel lock */
	struct nexus_adapter *ch_na;		/* adapter bound to */
	struct kern_nexus *ch_nexus;		/* owning nexus */
	struct ch_info *ch_info;		/* channel info */
	struct kern_pbufpool *ch_pp;		/* packet buffer pool */

	uint32_t ch_refcnt;			/* reference count */
	volatile uint32_t ch_flags;		/* CHANF_* flags */

	/* range of tx/rx/allocator/event rings to scan */
	ring_id_t ch_first[NR_ALL];
	ring_id_t ch_last[NR_ALL];

	struct __user_channel_schema *ch_schema; /* user-visible schema */

	/*
	 * Pointers to the selinfo to be used for selrecord.
	 * Either the local or the global one depending on the
	 * number of rings.
	 */
	struct ch_selinfo *ch_si[NR_ALL];

	STAILQ_ENTRY(kern_channel) ch_link;		/* list linkage */
	STAILQ_ENTRY(kern_channel) ch_link_if_adv;	/* if-advisory linkage */
	void *ch_ctx;				/* opaque channel context */
	mach_vm_offset_t ch_schema_offset;	/* schema offset in arena */
	struct skmem_arena_mmap_info ch_mmap;	/* task mmap state */
	int ch_fd;				/* might be -1 if no fd */
	pid_t ch_pid;				/* process ID */
	char ch_name[32];			/* process name */
};
174
/* valid values for ch_flags */
#define CHANF_ATTACHED		0x1	/* attached and connected to nexus */
#define CHANF_PLATFORM		0x2	/* platform binary process */
#define CHANF_KERNEL		0x4	/* kernel only; has no task map */
#define CHANF_RXONLY		0x8	/* receive only, no transmit */
#define CHANF_USER_PACKET_POOL	0x10	/* userspace using packet pool */
#define CHANF_EXCLUSIVE		0x20	/* exclusive bind to ring(s) */
#define CHANF_NONXREF		0x40	/* has no nexus reference */
#define CHANF_HOST		0x80	/* opened to host (kernel) stack */
#define CHANF_EXT_SKIP		0x100	/* don't notify external provider */
#define CHANF_EXT_PRECONNECT	0x200	/* successful nxpi_pre_connect() */
#define CHANF_EXT_CONNECTED	0x400	/* successful nxpi_connected() */
#define CHANF_EVENT_RING	0x1000	/* channel has event rings */
#define CHANF_IF_ADV		0x2000	/* interface advisory is active */
#define CHANF_DEFUNCT_SKIP	0x4000	/* defunct skipped due to active use */
#define CHANF_CLOSING		(1U << 30) /* channel is being closed */
#define CHANF_DEFUNCT		(1U << 31) /* channel is now defunct */

/* bit-name string for %b-style formatting of ch_flags */
#define CHANF_BITS \
	"\020\01ATTACHED\02PLATFORM\03KERNEL\04RXONLY\05USER_PKT_POOL" \
	"\06EXCLUSIVE\07NONXREF\010HOST\011EXT_SKIP\012EXT_PRECONNECT" \
	"\013EXT_CONNECTED\015EVENT\016ADVISORY" \
	"\017DEFUNCT_SKIP\037CLOSING\040DEFUNCT"

/* valid values for ch_kevhints */
#define CHAN_FILT_HINT_FLOW_ADV_UPD	0x1	/* flow advisory update */
#define CHAN_FILT_HINT_CHANNEL_EVENT	0x2	/* channel event */
#define CHAN_FILT_HINT_IF_ADV_UPD	0x4	/* Interface advisory update */

/* bit-name string for %b-style formatting of kevent hints */
#define CHAN_FILT_HINT_BITS	"\020\01FLOW_ADV\02CHANNEL_EVENT\03IF_ADV"
205
/* set of rings a channel request selects (see chreq below) */
typedef enum {
	RING_SET_ALL = 0,		/* all rings */
	RING_SET_DEFAULT = RING_SET_ALL,
} ring_set_t;
210
/* kind of nexus endpoint a channel is connected to */
typedef enum {
	CH_ENDPOINT_NULL = 0,		/* not connected */
	CH_ENDPOINT_USER_PIPE_MASTER,	/* user pipe, master side */
	CH_ENDPOINT_USER_PIPE_SLAVE,	/* user pipe, slave side */
	CH_ENDPOINT_KERNEL_PIPE,	/* kernel pipe */
	CH_ENDPOINT_NET_IF,		/* netif nexus */
	CH_ENDPOINT_FLOW_SWITCH,	/* flow switch nexus */
} ch_endpoint_t;
219
/* maximum name length in a channel request */
#define CHREQ_NAMELEN	64

/*
 * Channel request: parameters for opening/binding a channel; the
 * direction of each field (caller's perspective) is noted in/out.
 */
struct chreq {
	char cr_name[CHREQ_NAMELEN];		/* in */
	uuid_t cr_spec_uuid;			/* in */
	struct ch_ev_thresh cr_tx_lowat;	/* in */
	struct ch_ev_thresh cr_rx_lowat;	/* in */
	nexus_port_t cr_port;			/* in/out */
	uint32_t cr_mode;			/* in */
	uint32_t cr_pipe_id;			/* in */
	ring_id_t cr_ring_id;			/* in */
	ring_set_t cr_ring_set;			/* out */
	ch_endpoint_t cr_endpoint;		/* out */
	mach_vm_size_t cr_memsize;		/* out */
	mach_vm_offset_t cr_memoffset;		/* out */
};
236
237 /*
238 * Private, kernel view of a ring. Keeps track of the status of
239 * a ring across system calls.
240 *
241 * ckr_khead Index of the next buffer to refill. It corresponds
242 * to ring_head at the time the system call returns.
243 *
244 * ckr_ktail Index of the first buffer owned by the kernel.
245 *
246 * On RX, ckr_khead to ckr_ktail are receive buffers that
247 * are not yet released. ckr_khead is advanced following
248 * ring_head, ckr_ktail is advanced on incoming packets.
249 *
250 * On TX, ckr_rhead has been filled by the sender but not
251 * sent yet to the destination; ckr_rhead to ckr_ktail are
252 * available for new transmissions, and ckr_ktail to
253 * ckr_khead-1 are pending transmissions.
254 *
255 * Here is the layout for the RX and TX rings.
256 *
257 * RX RING TX RING
258 *
259 * +-----------------+ +-----------------+
260 * | | | |
261 * |XXX free slot XXX| |XXX free slot XXX|
262 * +-----------------+ +-----------------+
263 * head->| owned by user |<-khead | not sent to nic |<-khead
264 * | | | yet |
265 * | | | |
266 * +-----------------+ + ------ +
267 * tail->| |<-ktail | |<-klease
268 * | (being | ... | | ...
269 * | prepared) | ... | | ...
270 * +-----------------+ ... | | ...
271 * | |<-klease +-----------------+
272 * | | tail->| |<-ktail
273 * | | | |
274 * | | | |
275 * | | | |
276 * +-----------------+ +-----------------+
277 *
278 * The head/tail (user view) and khead/ktail (kernel view)
279 * are used in the normal operation of the adapter.
280 *
281 * For flow switch nexus:
282 *
 * Concurrent rxsync or txsync on the same ring are prevented
 * by na_kr_(try)get() which in turn uses ckr_busy. This is all we need
285 * for NIC rings, and for TX rings attached to the host stack.
286 *
287 * RX rings attached to the host stack use an nx_mbq (ckr_rx_queue) on both
288 * nx_netif_rxsync_from_host() and nx_netif_compat_transmit(). The nx_mbq is
289 * protected by its internal lock.
290 *
291 * RX rings attached to the flow switch are accessed by both senders
292 * and receiver. They are protected through the q_lock on the RX ring.
293 *
294 * When a ring is the output of a switch port (RX ring for a flow switch
295 * port, TX ring for the host stack or NIC), slots are reserved in blocks
296 * through ckr_klease which points to the next unused slot.
297 *
298 * On an RX ring, ckr_klease is always after ckr_ktail, and completions cause
299 * ckr_ktail to advance. On a TX ring, ckr_klease is always between ckr_khead
300 * and ckr_ktail, and completions cause ckr_khead to advance.
301 *
302 * nx_fsw_vp_na_kr_space()
303 * returns the maximum number of slots that can be assigned.
304 *
305 * nx_fsw_vp_na_kr_lease() reserves the required number of buffers,
306 * advances ckr_klease and also returns an entry in a circular
307 * array where completions should be reported.
308 *
309 * For netif nexus:
310 *
311 * The indexes in the NIC and rings are offset by ckr_hwofs slots. This is
312 * so that, on a reset, buffers owned by userspace are not modified by the
313 * kernel. In particular:
314 *
315 * RX rings: the next empty buffer (ckr_ktail + ckr_hwofs) coincides with
316 * the next empty buffer as known by the hardware "next to check".
317 * TX rings: ckr_khead + ckr_hwofs coincides with "next to send".
318 *
319 */
/* per-ring notify callback type (same signature as ckr_na_notify) */
typedef int (*channel_ring_notify_t)(struct __kern_channel_ring *,
    struct proc *, uint32_t);
322
/*
 * Kernel view of one channel ring; see the large block comment above
 * for the khead/ktail/klease/hwofs semantics.
 */
struct __kern_channel_ring {
	struct __user_channel_ring *ckr_ring;	/* shared user ring */

	uint32_t ckr_flags;		/* CKRF_* flags */
	slot_idx_t ckr_num_slots;	/* # of slots */
	uint32_t ckr_max_pkt_len;	/* max pp pkt size */
	uint32_t ckr_largest;		/* largest packet seen */
	const slot_idx_t ckr_lim;	/* ckr_num_slots - 1 */
	enum txrx ckr_tx;		/* kind of ring (tx/rx/alloc/free) */

	/* next slot to refill; mirrors ring_head at syscall return */
	volatile slot_idx_t ckr_khead;
	/* first slot owned by the kernel (see block comment above) */
	volatile slot_idx_t ckr_ktail;
	/*
	 * value of ckr_khead recorded at TX prologue (pre-sync)
	 */
	volatile slot_idx_t ckr_khead_pre;
	/*
	 * Copies of values in user rings, so we do not need to look
	 * at the ring (which could be modified). These are set in the
	 * *sync_prologue()/finalize() routines.
	 */
	volatile slot_idx_t ckr_rhead;
	volatile slot_idx_t ckr_rtail;

	/* EWMA decay rate */
	uint32_t ckr_transfer_decay;

	uint64_t ckr_ready_bytes;	/* bytes ready for transfer */
	uint64_t ckr_ready_slots;	/* slots ready for transfer */

	/*
	 * While ckr_state is set, no new [tr]xsync operations can be
	 * started on this kring. This is used by na_disable_all_rings()
	 * to find a synchronization point where critical data structures
	 * pointed to by the kring can be added or removed.
	 */
	decl_lck_spin_data(, ckr_slock);
	struct thread *ckr_owner;	/* busy owner */
	uint32_t ckr_busy;		/* prevent kring modifications */
	uint32_t ckr_want;		/* # of threads that lost the race */
	uint32_t ckr_state;		/* KR_* states */

	/* current working set for the allocator ring */
	volatile uint32_t ckr_alloc_ws;

	struct nexus_adapter *ckr_na;	/* adapter this kring belongs to */
	struct kern_pbufpool *ckr_pp;	/* adapter's packet buffer pool */

	/*
	 * Array of __slot_desc each representing slot-specific data, e.g.
	 * index to metadata, etc. There is exactly one descriptor for each
	 * slot in the ring. Note that the size of the array may be greater
	 * than the number of slots for this ring, and so we constrain
	 * range with [ckr_ksds, ckr_ksds_last] during validations.
	 */
	struct __slot_desc *__counted_by(ckr_usds_cnt) ckr_usds; /* slot desc array (user) */
	slot_idx_t ckr_usds_cnt;
	struct __slot_desc *__counted_by(ckr_ksds_cnt) ckr_ksds; /* slot desc array (kernel) */
	slot_idx_t ckr_ksds_cnt;
	struct __slot_desc *ckr_ksds_last;	/* cache last ksd */
	struct skmem_cache *ckr_ksds_cache;	/* owning skmem_cache for ksd */

	uint32_t ckr_ring_id;			/* ring ID */

	boolean_t ckr_rate_limited;		/* ring is rate limited */

	/*
	 * Array of packet handles for as many slots as there are in the
	 * ring; this is useful for storing an array of kern_packet_t to
	 * be used when invoking the packet APIs. Only safe to be used
	 * in the context of a sync as we're single-threaded then.
	 * The memory is owned by the nexus adapter.
	 */
	uint64_t *__counted_by(ckr_scratch_cnt)ckr_scratch;
	slot_idx_t ckr_scratch_cnt;
	/*
	 * [tx]sync callback for this kring. The default na_kring_create
	 * callback (na_kr_create) sets the ckr_na_sync callback of each
	 * tx(rx) kring to the corresponding na_txsync(na_rxsync) taken
	 * from the nexus_adapter.
	 *
	 * Overrides: the above configuration is not changed by
	 * any of the nm_krings_create callbacks.
	 */
	int (*ckr_na_sync)(struct __kern_channel_ring *,
	    struct proc *, uint32_t);
	int(*volatile ckr_na_notify)(struct __kern_channel_ring *,
	    struct proc *, uint32_t);

	/* pre/post-sync hooks invoked by the sync paths */
	int (*ckr_prologue)(struct kern_channel *,
	    struct __kern_channel_ring *, const slot_idx_t,
	    uint32_t *, uint64_t *, struct proc *);
	void (*ckr_finalize)(struct kern_channel *,
	    struct __kern_channel_ring *, const slot_idx_t, struct proc *);

	/* time of last channel sync (updated at sync prologue time) */
	uint64_t ckr_sync_time;

#if CONFIG_NEXUS_FLOWSWITCH
	/* The following fields are for flow switch support */
	int (*ckr_save_notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
#endif /* CONFIG_NEXUS_FLOWSWITCH */

	kern_packet_svc_class_t ckr_svc;	/* service class */

	/*
	 * (Optional) array of slot contexts for as many slots as there
	 * are in the ring; the memory is owned by the nexus adapter.
	 */
	uint32_t ckr_slot_ctxs_set;	/* number of valid/set contexts */
	struct slot_ctx *__counted_by(ckr_slot_ctxs_cnt)ckr_slot_ctxs; /* (optional) array of slot contexts */
	uint32_t ckr_slot_ctxs_cnt;
	void *ckr_ctx;			/* ring context */

	struct ch_selinfo ckr_si;	/* per-ring wait queue */

#if CONFIG_NEXUS_NETIF
	/*
	 * netif adapters intercepts ckr_na_notify in order to
	 * mitigate IRQ events; the actual notification is done
	 * by invoking the original notify callback routine
	 * saved at na_activate() time.
	 */
	int (*ckr_netif_notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	void (*ckr_netif_mit_stats)(struct __kern_channel_ring *kring,
	    uint64_t, uint64_t);
	struct nx_netif_mit *ckr_mit;	/* mitigation state */

	volatile uint32_t ckr_pending_intr;	/* pending interrupt events */
	volatile uint32_t ckr_pending_doorbell;	/* pending doorbell events */

	/*
	 * Support for adapters without native Skywalk support.
	 * On tx rings we preallocate an array of tx buffers
	 * (same size as the channel ring), on rx rings we
	 * store incoming mbufs in a queue that is drained by
	 * a rxsync.
	 */
	struct mbuf **__counted_by(ckr_tx_pool_count) ckr_tx_pool;
	uint32_t ckr_tx_pool_count;
	struct nx_mbq ckr_rx_queue;	/* intercepted rx mbufs. */
#endif /* CONFIG_NEXUS_NETIF */

#if CONFIG_NEXUS_USER_PIPE
	/* if this is a pipe ring, pointer to the other end */
	struct __kern_channel_ring *ckr_pipe;
	/* pointer to hidden rings (see nx_user_pipe.c for details) */
	struct __user_channel_ring *ckr_save_ring;
#endif /* CONFIG_NEXUS_USER_PIPE */

	/*
	 * Protects kring in the event of multiple writers;
	 * only used by flow switch.
	 */
	decl_lck_mtx_data(, ckr_qlock);

	uint32_t ckr_users;		/* existing bindings for this ring */

	/* ring flush rate limit (token bucket) */
	int64_t ckr_tbr_token;
	int64_t ckr_tbr_depth;
	uint64_t ckr_tbr_last;
#define CKR_TBR_TOKEN_INVALID	INT64_MAX

	/* stats capturing errors */
	channel_ring_error_stats ckr_err_stats __sk_aligned(64);

	/* stats capturing actual data movement (nexus provider's view) */
	channel_ring_stats ckr_stats __sk_aligned(64);
	uint64_t ckr_accumulated_bytes;
	uint64_t ckr_accumulated_slots;
	uint64_t ckr_accumulate_start;	/* in seconds */

	/* stats capturing user activities per sync (user's view) */
	channel_ring_user_stats ckr_usr_stats __sk_aligned(64);
	uint64_t ckr_user_accumulated_bytes;
	uint64_t ckr_user_accumulated_slots;
	uint64_t ckr_user_accumulated_syncs;
	uint64_t ckr_user_accumulate_start;	/* in seconds */

	lck_grp_t *ckr_qlock_group;	/* lock group for ckr_qlock */
	lck_grp_t *ckr_slock_group;	/* lock group for ckr_slock */

	char ckr_name[64];		/* diagnostic */

	uint64_t ckr_rx_dequeue_ts;	/* last timestamp when userspace dequeued */
	uint64_t ckr_rx_enqueue_ts;	/* last timestamp when kernel enqueued */
} __sk_aligned(CHANNEL_CACHE_ALIGN_MAX);
513
/* lock/unlock and ownership assertions for the kring qlock */
#define KR_LOCK(_kr)	\
	lck_mtx_lock(&(_kr)->ckr_qlock)
#define KR_LOCK_SPIN(_kr)	\
	lck_mtx_lock_spin(&(_kr)->ckr_qlock)
#define KR_LOCK_TRY(_kr)	\
	lck_mtx_try_lock(&(_kr)->ckr_qlock)
#define KR_LOCK_ASSERT_HELD(_kr)	\
	LCK_MTX_ASSERT(&(_kr)->ckr_qlock, LCK_MTX_ASSERT_OWNED)
#define KR_LOCK_ASSERT_NOTHELD(_kr)	\
	LCK_MTX_ASSERT(&(_kr)->ckr_qlock, LCK_MTX_ASSERT_NOTOWNED)
#define KR_UNLOCK(_kr)	\
	lck_mtx_unlock(&(_kr)->ckr_qlock)
526
/* valid values for ckr_flags */
#define CKRF_EXCLUSIVE		0x1	/* exclusive binding */
#define CKRF_DROP		0x2	/* drop all mode */
#define CKRF_HOST		0x4	/* host ring */
#define CKRF_MEM_RING_INITED	0x8	/* na_kr_setup() succeeded */
#define CKRF_MEM_SD_INITED	0x10	/* na_kr_setup() succeeded */
#define CKRF_EXT_RING_INITED	0x20	/* nxpi_ring_init() succeeded */
#define CKRF_EXT_SLOTS_INITED	0x40	/* nxpi_slot_init() succeeded */
#define CKRF_SLOT_CONTEXT	0x80	/* ckr_slot_ctxs is valid */
#define CKRF_MITIGATION		0x100	/* supports event mitigation */
#define CKRF_DEFUNCT		0x200	/* no longer in service */
#define CKRF_KERNEL_ONLY	(1U << 31) /* not usable by userland */

/* bit-name string for %b-style formatting of ckr_flags */
#define CKRF_BITS \
	"\020\01EXCLUSIVE\02DROP\03HOST\04MEM_RING_INITED" \
	"\05MEM_SD_INITED\06EXT_RING_INITED\07EXT_SLOTS_INITED" \
	"\010SLOT_CONTEXT\011MITIGATION\012DEFUNCT\040KERNEL_ONLY"

/* adapter that kring _kr belongs to (const-stripped) */
#define KRNA(_kr) \
	((__DECONST(struct __kern_channel_ring *, _kr))->ckr_na)

/* true if kring is restricted to kernel-only use */
#define KR_KERNEL_ONLY(_kr) \
	(((_kr)->ckr_flags & CKRF_KERNEL_ONLY) != 0)
/* true if kring should drop traffic (drop mode or defunct) */
#define KR_DROP(_kr) \
	(((_kr)->ckr_flags & (CKRF_DROP|CKRF_DEFUNCT)) != 0)
552
/* valid values for ckr_state */
enum {
	KR_READY = 0,	/* ready for sync operations */
	KR_STOPPED,	/* unbounded stop */
	KR_LOCKED,	/* bounded, brief stop for mutual exclusion */
};

/* kernel slot descriptor for slot _slot_idx of _kring */
#define KR_KSD(_kring, _slot_idx) \
	(SLOT_DESC_KSD(&(_kring)->ckr_ksds[_slot_idx]))

/* user slot descriptor for slot _slot_idx of _kring */
#define KR_USD(_kring, _slot_idx) \
	(SLOT_DESC_USD(&(_kring)->ckr_usds[_slot_idx]))
565
566 __attribute__((always_inline))
567 static inline slot_idx_t
KR_SLOT_INDEX(const struct __kern_channel_ring * kr,const struct __slot_desc * slot)568 KR_SLOT_INDEX(const struct __kern_channel_ring *kr,
569 const struct __slot_desc *slot)
570 {
571 ASSERT(slot >= kr->ckr_ksds && slot <= kr->ckr_ksds_last);
572 return (slot_idx_t)(slot - kr->ckr_ksds);
573 }
574
/* Helper macros for slot descriptor, decoupled for KSD/USD. */

/* true if the KSD has a metadata object attached */
#define KSD_VALID_METADATA(_ksd) \
	((_ksd)->sd_md != NULL)

/* initialize KSD to the detached state */
#define KSD_INIT(_ksd) do { \
	(_ksd)->sd_md = NULL; \
} while (0)

/* attach metadata _md_addr to KSD; KSD must currently be detached */
#define KSD_ATTACH_METADATA(_ksd, _md_addr) do { \
	ASSERT((_ksd) != NULL); \
	ASSERT((_ksd)->sd_md == NULL); \
	(_ksd)->sd_md = (_md_addr); \
} while (0)

/* detach previously-attached metadata from KSD */
#define KSD_DETACH_METADATA(_ksd) do { \
	ASSERT((_ksd) != NULL); \
	ASSERT((_ksd)->sd_md != NULL); \
	(_ksd)->sd_md = NULL; \
} while (0)

#define KSD_RESET(_ksd)	KSD_INIT(_ksd)

/* initialize USD: no metadata index, no flags, zero length */
#define USD_INIT(_usd) do { \
	(_usd)->sd_md_idx = OBJ_IDX_NONE; \
	(_usd)->sd_flags = 0; \
	(_usd)->sd_len = 0; \
} while (0)
603
/*
 * Attach metadata index _md_idx to USD and mark the index valid; any
 * non-user flag bits are masked off.  The USD must not already carry a
 * valid index.  Note: no trailing semicolon after "while (0)" -- the
 * macro must expand to a single statement so that constructs such as
 * "if (x) USD_ATTACH_METADATA(...); else ..." remain well-formed
 * (the stray ';' previously here broke that, unlike every sibling
 * macro in this file).
 */
#define USD_ATTACH_METADATA(_usd, _md_idx) do { \
	ASSERT((_usd) != NULL); \
	ASSERT((_usd)->sd_md_idx == OBJ_IDX_NONE); \
	ASSERT(((_usd)->sd_flags & SD_IDX_VALID) == 0); \
	(_usd)->sd_md_idx = (_md_idx); \
	(_usd)->sd_flags |= SD_IDX_VALID; \
	/* mask off non-user flags */ \
	(_usd)->sd_flags &= SD_FLAGS_USER; \
} while (0)
613
/* detach the metadata index from USD and clear SD_IDX_VALID */
#define USD_DETACH_METADATA(_usd) do { \
	ASSERT((_usd) != NULL); \
	(_usd)->sd_md_idx = OBJ_IDX_NONE; \
	/* mask off non-user flags */ \
	(_usd)->sd_flags &= SD_FLAGS_USER; \
	(_usd)->sd_flags &= ~SD_IDX_VALID; \
} while (0)

#define USD_RESET(_usd)	USD_INIT(_usd)

/* record metadata length _md_len in USD */
#define USD_SET_LENGTH(_usd, _md_len) do { \
	ASSERT((_usd) != NULL); \
	(_usd)->sd_len = _md_len; \
} while (0)
628
/* copy one 8-byte USD (size is static-asserted) */
#define _USD_COPY(_src, _dst) do { \
	static_assert(sizeof(struct __user_slot_desc) == 8); \
	sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \
} while (0)

/* swap two USDs via an aligned temporary */
#define _USD_SWAP(_usd1, _usd2) do { \
	struct __user_slot_desc _tusd __sk_aligned(64); \
	_USD_COPY(_usd1, &_tusd); \
	_USD_COPY(_usd2, _usd1); \
	_USD_COPY(&_tusd, _usd2); \
} while (0)

/* copy one 8-byte KSD (size is static-asserted) */
#define _KSD_COPY(_src, _dst) do { \
	static_assert(sizeof(struct __kern_slot_desc) == 8); \
	sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \
} while (0)

/* swap two KSDs via an aligned temporary */
#define _KSD_SWAP(_ksd1, _ksd2) do { \
	struct __kern_slot_desc _tksd __sk_aligned(64); \
	_KSD_COPY(_ksd1, &_tksd); \
	_KSD_COPY(_ksd2, _ksd1); \
	_KSD_COPY(&_tksd, _ksd2); \
} while (0)

/*
 * Swap both the user and kernel halves of two slot descriptors, then
 * re-point each attached quantum's back-reference (qum_ksd) at the KSD
 * that now holds it.
 */
#define SD_SWAP(_ksd1, _usd1, _ksd2, _usd2) do { \
	_USD_SWAP(_usd1, _usd2); \
	_KSD_SWAP(_ksd1, _ksd2); \
	/* swap packet attachment */ \
	*(struct __kern_slot_desc **)(uintptr_t)&(_ksd1)->sd_qum->qum_ksd = \
	    (_ksd1); \
	*(struct __kern_slot_desc **)(uintptr_t)&(_ksd2)->sd_qum->qum_ksd = \
	    (_ksd2); \
} while (0)
662
/*
 * Fetch the first buflet's address, object address, data offset,
 * data length and data limit for metadata _md.
 * NOTE(review): treats _md as a __kern_packet; confirm callers only
 * pass packet-compatible metadata here.
 */
#define _MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim) do { \
	struct __kern_packet *_p = \
	    (struct __kern_packet *)(void *)(_md); \
	struct __kern_buflet *_kbft; \
	PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \
	(_addr) = __unsafe_forge_bidi_indexable(void *, \
	    __DECONST(void *, _kbft->buf_addr), _kbft->buf_dlim); \
	(_objaddr) = __unsafe_forge_bidi_indexable(void *, \
	    _kbft->buf_objaddr, _kbft->buf_dlim); \
	(_doff) = _kbft->buf_doff; \
	(_dlen) = _kbft->buf_dlen; \
	(_dlim) = _kbft->buf_dlim; \
	ASSERT((_addr) != NULL); \
	ASSERT((_objaddr) != NULL); \
} while (0)

/* packet-only variant: asserts metadata type, yields base address only */
#define _MD_BUFLET_ADDR_PKT(_md, _addr) do { \
	ASSERT(METADATA_TYPE(SK_PTR_ADDR_KQUM(_md)) == \
	    NEXUS_META_TYPE_PACKET); \
	struct __kern_packet *_p = (struct __kern_packet *)(void *)(_md); \
	struct __kern_buflet *_kbft; \
	PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \
	(_addr) = __unsafe_forge_bidi_indexable(void *, \
	    __DECONST(void *, _kbft->buf_addr), _kbft->buf_dlim); \
	ASSERT((_addr) != NULL); \
} while (0)
689
690
/*
 * Return the data offset adjusted virtual address of a buffer associated
 * with the metadata; for metadata with multiple buflets, this is the
 * first buffer's address.
 */
#define MD_BUFLET_ADDR(_md, _val) do { \
	void *_addr, *_objaddr; \
	uint32_t _doff, _dlen, _dlim; \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
	/* skip past buflet data offset */ \
	(_val) = (void *)((uint8_t *)_addr + _doff); \
} while (0)

/*
 * Return the absolute virtual address of a buffer associated with the
 * metadata; for metadata with multiple buflets, this is the first
 * buffer's address.  (Unused outputs of _MD_BUFLET_ADDROFF are
 * discarded.)
 */
#define MD_BUFLET_ADDR_ABS(_md, _val) do { \
	void *_addr, *_objaddr; \
	uint32_t _doff, _dlen, _dlim; \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
	(_val) = (void *)_addr; \
} while (0)

/* similar to MD_BUFLET_ADDR_ABS() but optimized only for packets */
#define MD_BUFLET_ADDR_ABS_PKT(_md, _val) do { \
	void *_addr; \
	_MD_BUFLET_ADDR_PKT(_md, _addr); \
	(_val) = (void *)_addr; \
} while (0)

/* absolute address plus data length/limit/offset outputs */
#define MD_BUFLET_ADDR_ABS_DLEN(_md, _val, _dlen, _dlim, _doff) do { \
	void *_addr, *_objaddr; \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
	(_val) = (void *)_addr; \
} while (0)


/*
 * Return the buffer's object address associated with the metadata; for
 * metadata with multiple buflets, this is the first buffer's object address.
 */
#define MD_BUFLET_OBJADDR(_md, _val) do { \
	void *_addr, *_objaddr; \
	uint32_t _doff, _dlen, _dlim; \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
	(_val) = (void *)_objaddr; \
} while (0)

/*
 * Return the data offset adjusted virtual address of a buffer associated
 * with the metadata; for metadata with multiple buflets, this is the
 * first buffer's address and data length.
 */
#define MD_BUFLET_ADDR_DLEN(_md, _val, _dlen) do { \
	void *_addr, *_objaddr; \
	uint32_t _doff, _dlim; \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
	/* skip past buflet data offset */ \
	(_val) = (void *)(__unsafe_forge_bidi_indexable(uint8_t *, _addr, _dlim) + _doff); \
} while (0)
754
755 /* kr_space: return available space for enqueue into kring */
756 __attribute__((always_inline))
757 static inline uint32_t
kr_available_slots(struct __kern_channel_ring * kr)758 kr_available_slots(struct __kern_channel_ring *kr)
759 {
760 uint32_t space;
761
762 space = kr->ckr_lim - (kr->ckr_num_slots - kr->ckr_khead);
763 return space;
764 }
765
766 /* kr_space: return available space for enqueue into Rx kring */
767 __attribute__((always_inline))
768 static inline uint32_t
kr_available_slots_rxring(struct __kern_channel_ring * rxkring)769 kr_available_slots_rxring(struct __kern_channel_ring *rxkring)
770 {
771 int busy;
772 uint32_t space;
773
774 /* # of rx busy (unclaimed) slots */
775 busy = (int)(rxkring->ckr_ktail - rxkring->ckr_khead);
776 if (busy < 0) {
777 busy += rxkring->ckr_num_slots;
778 }
779
780 /* # of rx avail free slots (subtract busy from max) */
781 space = rxkring->ckr_lim - (uint32_t)busy;
782 return space;
783 }
784
785 extern kern_allocation_name_t skmem_tag_ch_key;
786
787 #if (DEVELOPMENT || DEBUG)
788 SYSCTL_DECL(_kern_skywalk_channel);
#endif /* DEVELOPMENT || DEBUG */
790
791 __BEGIN_DECLS
792 extern int channel_init(void);
793 extern void channel_fini(void);
794
795 extern struct kern_channel *ch_open(struct ch_init *, struct proc *,
796 int, int *);
797 extern struct kern_channel *ch_open_special(struct kern_nexus *,
798 struct chreq *, boolean_t, int *);
799 extern void ch_close(struct kern_channel *, boolean_t);
800 extern void ch_close_special(struct kern_channel *);
801 extern int ch_kqfilter(struct kern_channel *, struct knote *,
802 struct kevent_qos_s *kev);
803 extern boolean_t ch_is_multiplex(struct kern_channel *, enum txrx);
804 extern int ch_select(struct kern_channel *, int, void *, struct proc *);
805 extern int ch_get_opt(struct kern_channel *, struct sockopt *);
806 extern int ch_set_opt(struct kern_channel *, struct sockopt *);
807 extern void ch_deactivate(struct kern_channel *);
808 extern void ch_retain(struct kern_channel *);
809 extern void ch_retain_locked(struct kern_channel *);
810 extern int ch_release(struct kern_channel *);
811 extern int ch_release_locked(struct kern_channel *);
812 extern void ch_dtor(struct kern_channel *);
813 extern void ch_update_upp_buf_stats(struct kern_channel *ch,
814 struct kern_pbufpool *pp);
815
816 #if SK_LOG
817 #define CH_DBGBUF_SIZE 256
818 extern char * ch2str(const struct kern_channel *na, char *__counted_by(dsz)dst,
819 size_t dsz);
820 #endif /* SK_LOG */
821
822 extern void csi_init(struct ch_selinfo *, boolean_t, uint64_t);
823 extern void csi_destroy(struct ch_selinfo *);
824 extern void csi_selrecord_one(struct __kern_channel_ring *, struct proc *,
825 void *);
826 extern void csi_selrecord_all(struct nexus_adapter *, enum txrx, struct proc *,
827 void *);
828 extern void csi_selwakeup_one(struct __kern_channel_ring *, boolean_t,
829 boolean_t, boolean_t, uint32_t);
830 extern void csi_selwakeup_all(struct nexus_adapter *, enum txrx, boolean_t,
831 boolean_t, boolean_t, uint32_t);
832
833 extern void kr_init_to_mhints(struct __kern_channel_ring *, uint32_t);
834 extern int kr_enter(struct __kern_channel_ring *, boolean_t);
835 extern void kr_exit(struct __kern_channel_ring *);
836 extern void kr_start(struct __kern_channel_ring *);
837 extern void kr_stop(struct __kern_channel_ring *kr, uint32_t state);
838 extern void kr_update_stats(struct __kern_channel_ring *kring,
839 uint32_t slot_count, uint32_t byte_count);
840 extern boolean_t kr_txempty(struct __kern_channel_ring *kring);
841 extern uint32_t kr_reclaim(struct __kern_channel_ring *kr);
842
843 extern slot_idx_t kr_txsync_prologue(struct kern_channel *,
844 struct __kern_channel_ring *, struct proc *);
845 extern int kr_txprologue(struct kern_channel *,
846 struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
847 struct proc *);
848 extern int kr_txprologue_upp(struct kern_channel *,
849 struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
850 struct proc *);
851
852 extern void kr_txsync_finalize(struct kern_channel *,
853 struct __kern_channel_ring *, struct proc *);
854 extern void kr_txfinalize(struct kern_channel *,
855 struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
856 extern void kr_txfinalize_upp(struct kern_channel *,
857 struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
858
859 extern slot_idx_t kr_rxsync_prologue(struct kern_channel *ch,
860 struct __kern_channel_ring *kring, struct proc *p);
861 extern int kr_rxprologue(struct kern_channel *,
862 struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
863 struct proc *);
864 extern int kr_rxprologue_nodetach(struct kern_channel *,
865 struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
866 struct proc *);
867 extern int kr_rxprologue_upp(struct kern_channel *,
868 struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
869 struct proc *);
870
871 extern void kr_rxsync_finalize(struct kern_channel *ch,
872 struct __kern_channel_ring *kring, struct proc *p);
873 extern void kr_rxfinalize(struct kern_channel *,
874 struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
875 extern void kr_rxfinalize_upp(struct kern_channel *,
876 struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
877
878 extern void kr_txkring_reclaim_and_refill(struct __kern_channel_ring *kring,
879 slot_idx_t index);
880 extern slot_idx_t kr_alloc_sync_prologue(struct __kern_channel_ring *kring,
881 struct proc *p);
882 extern slot_idx_t kr_free_sync_prologue(struct __kern_channel_ring *kring,
883 struct proc *p);
884 extern void kr_alloc_sync_finalize(struct __kern_channel_ring *kring,
885 struct proc *p);
886 extern void kr_free_sync_finalize(struct __kern_channel_ring *kring,
887 struct proc *p);
888 extern int kr_internalize_metadata(struct kern_channel *,
889 struct __kern_channel_ring *, const uint32_t, struct __kern_quantum *,
890 struct proc *);
891 extern void kr_externalize_metadata(struct __kern_channel_ring *,
892 const uint32_t, struct __kern_quantum *, struct proc *);
893 extern slot_idx_t kr_event_sync_prologue(struct __kern_channel_ring *kring,
894 struct proc *p);
895 extern void kr_event_sync_finalize(struct kern_channel *ch,
896 struct __kern_channel_ring *kring, struct proc *p);
897
898 #if SK_LOG
899 extern void kr_log_bad_ring(struct __kern_channel_ring *);
900 extern char * kr2str(const struct __kern_channel_ring *kr,
901 char *__counted_by(dsz)dst, size_t dsz);
902 #else
903 #define kr_log_bad_ring(_kr) do { ((void)0); } while (0)
904 #endif /* SK_LOG */
905 __END_DECLS
906 #endif /* BSD_KERNEL_PRIVATE */
907 #endif /* !_SKYWALK_CHANNEL_CHANNELVAR_H_ */
908