/*
 * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
 * All rights reserved.
 * Copyright (C) 2013-2014 Università di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _SKYWALK_CHANNEL_CHANNELVAR_H_
#define _SKYWALK_CHANNEL_CHANNELVAR_H_

#ifdef BSD_KERNEL_PRIVATE
#include <skywalk/core/skywalk_var.h>
#include <skywalk/os_channel_private.h>
#include <skywalk/nexus/nexus_mbq.h>
#include <skywalk/nexus/nexus_pktq.h>
#include <skywalk/mem/skmem_region_var.h>
#include <skywalk/mem/skmem_arena_var.h>

struct ch_selinfo {
	decl_lck_mtx_data(, csi_lock);
	struct selinfo  csi_si;
	uint32_t        csi_flags;
	uint32_t        csi_pending;
	uint64_t        csi_eff_interval;
	uint64_t        csi_interval;
	thread_call_t   csi_tcall;
};

/* values for csi_flags */
#define CSI_KNOTE               0x1             /* kernel note attached */
#define CSI_MITIGATION          0x10            /* has mitigation */
#define CSI_DESTROYED           (1U << 31)      /* has been destroyed */

#define CSI_LOCK(_csi)                  \
	lck_mtx_lock(&(_csi)->csi_lock)
#define CSI_LOCK_ASSERT_HELD(_csi)      \
	LCK_MTX_ASSERT(&(_csi)->csi_lock, LCK_MTX_ASSERT_OWNED)
#define CSI_LOCK_ASSERT_NOTHELD(_csi)   \
	LCK_MTX_ASSERT(&(_csi)->csi_lock, LCK_MTX_ASSERT_NOTOWNED)
#define CSI_UNLOCK(_csi)                        \
	lck_mtx_unlock(&(_csi)->csi_lock)

/* mitigation intervals in ns */
#define CH_MIT_IVAL_DEFAULT     (0)
#define CH_MIT_IVAL_WIFI        CH_MIT_IVAL_DEFAULT
#define CH_MIT_IVAL_CELLULAR    CH_MIT_IVAL_DEFAULT
#define CH_MIT_IVAL_ETHERNET    CH_MIT_IVAL_DEFAULT
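
/*
 * Illustrative sketch (not part of this header): a per-ring ch_selinfo
 * would be initialized via csi_init() (declared below), passing one of
 * the above intervals; that the boolean_t argument selects mitigation
 * is an assumption here:
 *
 *	csi_init(&kring->ckr_si, TRUE, CH_MIT_IVAL_DEFAULT);
 */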

/*
 * Kernel version of __user_slot_desc.
 *
 * Keep the slot descriptor as minimal as possible.
 * TODO: Should we make use of RX/TX preparation/writeback
 * descriptors (in a union)?
 */
struct __kern_slot_desc {
	union {
		struct __kern_quantum *sd_qum;
		struct __kern_packet *sd_pkt;
		struct __kern_buflet *sd_buf;
		void *sd_md;                    /* metadata address */
	};

#ifndef __LP64__
	uint32_t        _sd_pad[1];
#endif /* !__LP64__ */
};

/* _sd_{user,kern} are at same offset in the preamble */
#define SLOT_DESC_KSD(_sdp)     \
	__unsafe_forge_single(struct __kern_slot_desc *, \
	((struct __kern_slot_desc *)((uintptr_t)&(_sdp)->_sd_private)))

/*
 * Optional, per-slot context information.  An array of these structures
 * is allocated per nexus_adapter, and each real kring has its slots
 * correspond to one entry.  The 'arg' value is supplied via the slot_init
 * nexus provider callback, and is subsequently retrievable via calls to
 * kern_channel_slot_get_context().
 */
struct slot_ctx {
	mach_vm_address_t       slot_ctx_arg;   /* per-slot context */
};
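
/*
 * Illustrative flow (a sketch; the exact callback signatures are not
 * shown in this header and are assumed): a nexus provider publishes a
 * per-slot 'arg' from its slot_init callback, and later recovers it
 * from a slot handle during a sync:
 *
 *	in slot_init:	*slot_ctx_arg = (mach_vm_address_t)my_state;
 *	during a sync:	void *state =
 *			    kern_channel_slot_get_context(ring, slot);
 */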

extern lck_attr_t channel_lock_attr;
extern uint64_t __ch_umd_redzone_cookie;
extern uint32_t kr_stat_enable;

struct kern_nexus;
enum na_sync_mode;

struct kern_channel {
	decl_lck_mtx_data(, ch_lock);
	struct nexus_adapter    *ch_na;
	struct kern_nexus       *ch_nexus;
	struct ch_info          *ch_info;
	struct kern_pbufpool    *ch_pp;

	uint32_t                ch_refcnt;
	volatile uint32_t       ch_flags;       /* CHANF_* flags */

	/* range of tx/rx/allocator/event rings to scan */
	ring_id_t               ch_first[NR_ALL];
	ring_id_t               ch_last[NR_ALL];

	struct __user_channel_schema *ch_schema;

	/*
	 * Pointers to the selinfo to be used for selrecord.
	 * Either the local or the global one depending on the
	 * number of rings.
	 */
	struct ch_selinfo       *ch_si[NR_ALL];

	STAILQ_ENTRY(kern_channel) ch_link;
	STAILQ_ENTRY(kern_channel) ch_link_if_adv;
	void                    *ch_ctx;
	mach_vm_offset_t        ch_schema_offset;
	struct skmem_arena_mmap_info ch_mmap;
	int                     ch_fd;          /* might be -1 if no fd */
	pid_t                   ch_pid;         /* process ID */
	char                    ch_name[32];    /* process name */
};

/* valid values for ch_flags */
#define CHANF_ATTACHED          0x1     /* attached and connected to nexus */
#define CHANF_PLATFORM          0x2     /* platform binary process */
#define CHANF_KERNEL            0x4     /* kernel only; has no task map */
#define CHANF_RXONLY            0x8     /* receive only, no transmit */
#define CHANF_USER_PACKET_POOL  0x10    /* userspace using packet pool */
#define CHANF_EXCLUSIVE         0x20    /* exclusive bind to ring(s) */
#define CHANF_NONXREF           0x40    /* has no nexus reference */
#define CHANF_HOST              0x80    /* opened to host (kernel) stack */
#define CHANF_EXT_SKIP          0x100   /* don't notify external provider */
#define CHANF_EXT_PRECONNECT    0x200   /* successful nxpi_pre_connect() */
#define CHANF_EXT_CONNECTED     0x400   /* successful nxpi_connected() */
#define CHANF_EVENT_RING        0x1000  /* channel has event rings */
#define CHANF_IF_ADV            0x2000  /* interface advisory is active */
#define CHANF_DEFUNCT_SKIP      0x4000  /* defunct skipped due to active use */
#define CHANF_CLOSING           (1U << 30) /* channel is being closed */
#define CHANF_DEFUNCT           (1U << 31) /* channel is now defunct */

#define CHANF_BITS                                                      \
	"\020\01ATTACHED\02PLATFORM\03KERNEL\04RXONLY\05USER_PKT_POOL"  \
	"\06EXCLUSIVE\07NONXREF\010HOST\011EXT_SKIP\012EXT_PRECONNECT"  \
	"\013EXT_CONNECTED\015EVENT\016ADVISORY"                        \
	"\017DEFUNCT_SKIP\037CLOSING\040DEFUNCT"
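
/*
 * CHANF_BITS follows the BSD "%b" convention: the leading \020 (decimal
 * 16) selects hexadecimal output, and each subsequent \NN is a 1-based
 * bit position followed by that bit's name.  Sketch of its use with the
 * Skywalk debug logging (assuming the SK_D() macro takes printf-style
 * arguments):
 *
 *	SK_D("ch_flags 0x%b", ch->ch_flags, CHANF_BITS);
 */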

/* valid values for ch_kevhints */
#define CHAN_FILT_HINT_FLOW_ADV_UPD     0x1     /* flow advisory update */
#define CHAN_FILT_HINT_CHANNEL_EVENT    0x2     /* channel event */
#define CHAN_FILT_HINT_IF_ADV_UPD       0x4     /* interface advisory update */

#define CHAN_FILT_HINT_BITS    "\020\01FLOW_ADV\02CHANNEL_EVENT\03IF_ADV"

typedef enum {
	RING_SET_ALL = 0,               /* all rings */
	RING_SET_DEFAULT = RING_SET_ALL,
} ring_set_t;

typedef enum {
	CH_ENDPOINT_NULL = 0,
	CH_ENDPOINT_USER_PIPE_MASTER,
	CH_ENDPOINT_USER_PIPE_SLAVE,
	CH_ENDPOINT_KERNEL_PIPE,
	CH_ENDPOINT_NET_IF,
	CH_ENDPOINT_FLOW_SWITCH,
} ch_endpoint_t;

#define CHREQ_NAMELEN   64

struct chreq {
	char            cr_name[CHREQ_NAMELEN];         /* in */
	uuid_t          cr_spec_uuid;                   /* in */
	struct ch_ev_thresh cr_tx_lowat;                /* in */
	struct ch_ev_thresh cr_rx_lowat;                /* in */
	nexus_port_t    cr_port;                        /* in/out */
	uint32_t        cr_mode;                        /* in */
	uint32_t        cr_pipe_id;                     /* in */
	ring_id_t       cr_ring_id;                     /* in */
	ring_set_t      cr_ring_set;                    /* out */
	ch_endpoint_t   cr_real_endpoint;               /* out */
	ch_endpoint_t   cr_endpoint;                    /* out */
	mach_vm_size_t  cr_memsize;                     /* out */
	mach_vm_offset_t cr_memoffset;                  /* out */
};
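
/*
 * Example (sketch): a kernel client opening a channel via
 * ch_open_special() (declared below) fills only the "in" fields and
 * lets the "out" fields be populated on return.  CHMODE_KERNEL and
 * CHANNEL_RING_ID_ANY are assumed here to come from the os_channel
 * headers:
 *
 *	struct chreq chr;
 *	bzero(&chr, sizeof(chr));
 *	(void) snprintf(chr.cr_name, sizeof(chr.cr_name), "example");
 *	chr.cr_mode = CHMODE_KERNEL;
 *	chr.cr_ring_id = CHANNEL_RING_ID_ANY;
 *	(cr_memsize, cr_memoffset and the endpoints come back filled in)
 */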

/*
 * Private, kernel view of a ring.  Keeps track of the status of
 * a ring across system calls.
 *
 *	ckr_khead	Index of the next buffer to refill.  It corresponds
 *			to ring_head at the time the system call returns.
 *
 *	ckr_ktail	Index of the first buffer owned by the kernel.
 *
 *			On RX, ckr_khead to ckr_ktail are receive buffers that
 *			are not yet released.  ckr_khead is advanced following
 *			ring_head, ckr_ktail is advanced on incoming packets.
 *
 *			On TX, ckr_khead to ckr_rhead have been filled by the
 *			sender but not sent yet to the destination; ckr_rhead
 *			to ckr_ktail are available for new transmissions, and
 *			ckr_ktail to ckr_khead-1 are pending transmissions.
 *
 * Here is the layout for the RX and TX rings.
 *
 *            RX RING                         TX RING
 *
 *       +-----------------+            +-----------------+
 *       |                 |            |                 |
 *       |XXX free slot XXX|            |XXX free slot XXX|
 *       +-----------------+            +-----------------+
 * head->| owned by user   |<-khead     | not sent to nic |<-khead
 *       |                 |            | yet             |
 *       |                 |            |                 |
 *       +-----------------+            +     ------      +
 * tail->|                 |<-ktail     |                 |<-klease
 *       | (being          | ...        |                 | ...
 *       |  prepared)      | ...        |                 | ...
 *       +-----------------+ ...        |                 | ...
 *       |                 |<-klease    +-----------------+
 *       |                 |      tail->|                 |<-ktail
 *       |                 |            |                 |
 *       |                 |            |                 |
 *       |                 |            |                 |
 *       +-----------------+            +-----------------+
 *
 * The head/tail (user view) and khead/ktail (kernel view)
 * are used in the normal operation of the adapter.
 *
 * For flow switch nexus:
 *
 * The following fields are used to implement lock-free copy of packets
 * from input to output ports in flow switch:
 *
 *	ckr_klease	Buffer after the last one being copied.
 *			A writer in nx_fsw_vp_flush() reserves N buffers
 *			from ckr_klease, advances it, then does the
 *			copy outside the lock.
 *
 *			In RX rings (used for flow switch ports):
 *				ckr_ktail <= ckr_klease < ckr_khead+N-1
 *
 *			In TX rings (used for NIC or host stack ports):
 *				ckr_khead <= ckr_klease < ckr_ktail
 *
 *	ckr_leases	Array of ckr_num_slots entries where writers can
 *			report completion of their block.  CKR_NOSLOT (~0)
 *			indicates that the writer has not finished yet.
 *
 *	ckr_lease_idx	Index of next free slot in ckr_leases, to be assigned.
 *
 * The kring is manipulated by txsync/rxsync and generic kring functions.
 *
 * Concurrent rxsync or txsync on the same ring are prevented by
 * na_kr_(try)get(), which in turn uses ckr_busy.  This is all we need
 * for NIC rings, and for TX rings attached to the host stack.
 *
 * RX rings attached to the host stack use an nx_mbq (ckr_rx_queue) on both
 * nx_netif_rxsync_from_host() and nx_netif_compat_transmit(). The nx_mbq is
 * protected by its internal lock.
 *
 * RX rings attached to the flow switch are accessed by both senders
 * and receiver.  They are protected through the q_lock on the RX ring.
 *
 * When a ring is the output of a switch port (RX ring for a flow switch
 * port, TX ring for the host stack or NIC), slots are reserved in blocks
 * through ckr_klease, which points to the next unused slot.
 *
 * On an RX ring, ckr_klease is always after ckr_ktail, and completions cause
 * ckr_ktail to advance.  On a TX ring, ckr_klease is always between ckr_khead
 * and ckr_ktail, and completions cause ckr_khead to advance.
 *
 * nx_fsw_vp_na_kr_space() returns the maximum number of slots that
 *    can be assigned.
 *
 * nx_fsw_vp_na_kr_lease() reserves the required number of buffers,
 *    advances ckr_klease and also returns an entry in a circular
 *    array where completions should be reported.
 *
 * For netif nexus:
 *
 * The indexes in the NIC and channel rings are offset by ckr_hwofs slots.
 * This is so that, on a reset, buffers owned by userspace are not modified
 * by the kernel.  In particular:
 *
 * RX rings: the next empty buffer (ckr_ktail + ckr_hwofs) coincides with
 *      the next empty buffer as known by the hardware "next to check".
 * TX rings: ckr_khead + ckr_hwofs coincides with "next to send".
 *
 */
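
/*
 * Illustrative index arithmetic (a sketch, not used by the code below):
 * with every index in [0, ckr_num_slots), the size of a [lo, hi) span
 * that may wrap around is
 *
 *	static inline slot_idx_t
 *	kr_span(slot_idx_t lo, slot_idx_t hi, slot_idx_t num_slots)
 *	{
 *		int n = (int)(hi - lo);
 *		return (slot_idx_t)((n < 0) ? n + (int)num_slots : n);
 *	}
 *
 * so, per the TX layout above, kr_span(ckr_ktail, ckr_khead, ...) counts
 * pending transmissions; kr_available_slots() near the end of this file
 * applies the same computation to the lease span.
 */
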
typedef int (*channel_ring_notify_t)(struct __kern_channel_ring *,
    struct proc *, uint32_t);

struct __kern_channel_ring {
	struct __user_channel_ring      *ckr_ring;

	uint32_t                ckr_flags;      /* CKRF_* flags */
	slot_idx_t              ckr_num_slots;  /* # of slots */
	uint32_t                ckr_max_pkt_len; /* max pp pkt size */
	uint32_t                ckr_largest;    /* largest packet seen */
	const slot_idx_t        ckr_lim; /* ckr_num_slots - 1 */
	enum txrx               ckr_tx;  /* kind of ring (tx/rx/alloc/free) */

	volatile slot_idx_t     ckr_khead;
	volatile slot_idx_t     ckr_ktail;
	/*
	 * value of ckr_khead recorded at TX prologue (pre-sync)
	 */
	volatile slot_idx_t     ckr_khead_pre;
	/*
	 * Copies of values in user rings, so we do not need to look
	 * at the ring (which could be modified). These are set in the
	 * *sync_prologue()/finalize() routines.
	 */
	volatile slot_idx_t     ckr_rhead;
	volatile slot_idx_t     ckr_rtail;

	/* EWMA decay rate */
	uint32_t                ckr_transfer_decay;

	uint64_t                ckr_ready_bytes;
	uint64_t                ckr_ready_slots;

	/*
	 * While ckr_state is set, no new [tr]xsync operations can be
	 * started on this kring.  This is used by na_disable_all_rings()
	 * to find a synchronization point where critical data structures
	 * pointed to by the kring can be added or removed.
	 */
	decl_lck_spin_data(, ckr_slock);
	struct thread *ckr_owner; /* busy owner */
	uint32_t ckr_busy;      /* prevent kring modifications */
	uint32_t ckr_want;      /* # of threads that lost the race */
	uint32_t ckr_state;     /* KR_* states */

	/* current working set for the allocator ring */
	volatile uint32_t       ckr_alloc_ws;

	struct nexus_adapter *ckr_na;   /* adapter this kring belongs to */
	struct kern_pbufpool *ckr_pp;   /* adapter's packet buffer pool */

	/*
	 * Array of __slot_desc each representing slot-specific data, e.g.
	 * index to metadata, etc.  There is exactly one descriptor for each
	 * slot in the ring.  Note that the size of the array may be greater
	 * than the number of slots for this ring, and so we constrain the
	 * range with [ckr_ksds, ckr_ksds_last] during validations.
	 */
	struct __slot_desc *__unsafe_indexable ckr_usds;   /* slot desc array (user) */
	struct __slot_desc *__unsafe_indexable ckr_ksds;   /* slot desc array (kernel) */
	struct __slot_desc *__single ckr_ksds_last; /* cache last ksd */
	struct skmem_cache *ckr_ksds_cache; /* owning skmem_cache for ksd */

	uint32_t        ckr_ring_id;      /* ring ID */

	boolean_t       ckr_rate_limited; /* ring is rate limited */

	/*
	 * Array of packet handles for as many slots as there are in the
	 * ring; this is useful for storing an array of kern_packet_t to
	 * be used when invoking the packet APIs.  Only safe to be used
	 * in the context of a sync as we're single-threaded then.
	 * The memory is owned by the nexus adapter.
	 */
	uint64_t        *__unsafe_indexable ckr_scratch;

	/*
	 * [tx]sync callback for this kring.  The default na_kring_create
	 * callback (na_kr_create) sets the ckr_na_sync callback of each
	 * tx(rx) kring to the corresponding na_txsync(na_rxsync) taken
	 * from the nexus_adapter.
	 *
	 * Overrides: the above configuration is not changed by
	 * any of the na_kring_create callbacks.
	 */
	int (*ckr_na_sync)(struct __kern_channel_ring *,
	    struct proc *, uint32_t);
	int (*volatile ckr_na_notify)(struct __kern_channel_ring *,
	    struct proc *, uint32_t);

	int (*ckr_prologue)(struct kern_channel *,
	    struct __kern_channel_ring *, const slot_idx_t,
	    uint32_t *, uint64_t *, struct proc *);
	void (*ckr_finalize)(struct kern_channel *,
	    struct __kern_channel_ring *, const slot_idx_t, struct proc *);

	/* time of last channel sync (updated at sync prologue time) */
	uint64_t        ckr_sync_time;

#if CONFIG_NEXUS_FLOWSWITCH
	/* The following fields are for flow switch support */
	int (*ckr_save_notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	uint32_t        *ckr_leases;
#define CKR_NOSLOT      ((uint32_t)~0)  /* used in ckr_*lease* */
	slot_idx_t      ckr_klease;
	slot_idx_t      ckr_lease_idx;
#endif /* CONFIG_NEXUS_FLOWSWITCH */

	kern_packet_svc_class_t ckr_svc;

	/*
	 * (Optional) array of slot contexts for as many slots as there
	 * are in the ring; the memory is owned by the nexus adapter.
	 */
	uint32_t        ckr_slot_ctxs_set; /* number of valid/set contexts */
	struct slot_ctx *__unsafe_indexable ckr_slot_ctxs; /* (optional) array of slot contexts */

	void            *ckr_ctx;       /* ring context */

	struct ch_selinfo ckr_si;       /* per-ring wait queue */

#if CONFIG_NEXUS_NETIF
	/*
	 * netif adapters intercept ckr_na_notify in order to
	 * mitigate IRQ events; the actual notification is done
	 * by invoking the original notify callback routine
	 * saved at na_activate() time.
	 */
	int (*ckr_netif_notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	void (*ckr_netif_mit_stats)(struct __kern_channel_ring *kring,
	    uint64_t, uint64_t);
	struct nx_netif_mit *ckr_mit;

	volatile uint32_t ckr_pending_intr;
	volatile uint32_t ckr_pending_doorbell;

	/*
	 * Support for adapters without native Skywalk support.
	 * On tx rings we preallocate an array of tx buffers
	 * (same size as the channel ring); on rx rings we
	 * store incoming mbufs in a queue that is drained by
	 * an rxsync.
	 */
	struct mbuf     **ckr_tx_pool;
	struct nx_mbq   ckr_rx_queue;   /* intercepted rx mbufs */
#endif /* CONFIG_NEXUS_NETIF */

#if CONFIG_NEXUS_USER_PIPE
	/* if this is a pipe ring, pointer to the other end */
	struct __kern_channel_ring *ckr_pipe;
	/* pointer to hidden rings (see nx_user_pipe.c for details) */
	struct __user_channel_ring *ckr_save_ring;
#endif /* CONFIG_NEXUS_USER_PIPE */

	/*
	 * Protects kring in the event of multiple writers;
	 * only used by flow switch and monitor.
	 */
	decl_lck_mtx_data(, ckr_qlock);

#if CONFIG_NEXUS_MONITOR
	/* array of krings that are monitoring this kring */
	struct __kern_channel_ring **ckr_monitors;
	uint32_t ckr_max_monitors; /* current size of the monitors array */
	uint32_t ckr_n_monitors; /* next unused entry in the monitor array */
	/*
	 * Monitors work by intercepting the sync and notify callbacks of
	 * the monitored krings.  This is implemented by replacing the
	 * pointers above and saving the previous ones in the mon_*
	 * pointers below.
	 */
	int (*ckr_mon_sync)(struct __kern_channel_ring *kring, struct proc *,
	    uint32_t flags);
	int (*ckr_mon_notify)(struct __kern_channel_ring *kring, struct proc *,
	    uint32_t flags);

	uint32_t ckr_mon_tail;  /* last seen slot on rx */
	/* index of this ring in the monitored ring array */
	uint32_t ckr_mon_pos;
#endif /* CONFIG_NEXUS_MONITOR */

	uint32_t        ckr_users;      /* existing bindings for this ring */

	/* ring flush rate limit */
	int64_t         ckr_tbr_token;
	int64_t         ckr_tbr_depth;
	uint64_t        ckr_tbr_last;
#define CKR_TBR_TOKEN_INVALID   INT64_MAX

	/* stats capturing errors */
	channel_ring_error_stats ckr_err_stats
	__attribute__((aligned(sizeof(uint64_t))));

	/* stats capturing actual data movement (nexus provider's view) */
	channel_ring_stats ckr_stats
	__attribute__((aligned(sizeof(uint64_t))));
	uint64_t        ckr_accumulated_bytes;
	uint64_t        ckr_accumulated_slots;
	uint64_t        ckr_accumulate_start; /* in seconds */

	/* stats capturing user activities per sync (user's view) */
	channel_ring_user_stats ckr_usr_stats
	__attribute__((aligned(sizeof(uint64_t))));
	uint64_t        ckr_user_accumulated_bytes;
	uint64_t        ckr_user_accumulated_slots;
	uint64_t        ckr_user_accumulated_syncs;
	uint64_t        ckr_user_accumulate_start; /* in seconds */

	lck_grp_t       *ckr_qlock_group;
	lck_grp_t       *ckr_slock_group;

	char            ckr_name[64];   /* diagnostic */
} __attribute__((__aligned__(CHANNEL_CACHE_ALIGN_MAX)));

#define KR_LOCK(_kr)                    \
	lck_mtx_lock(&(_kr)->ckr_qlock)
#define KR_LOCK_SPIN(_kr)               \
	lck_mtx_lock_spin(&(_kr)->ckr_qlock)
#define KR_LOCK_TRY(_kr)                \
	lck_mtx_try_lock(&(_kr)->ckr_qlock)
#define KR_LOCK_ASSERT_HELD(_kr)        \
	LCK_MTX_ASSERT(&(_kr)->ckr_qlock, LCK_MTX_ASSERT_OWNED)
#define KR_LOCK_ASSERT_NOTHELD(_kr)     \
	LCK_MTX_ASSERT(&(_kr)->ckr_qlock, LCK_MTX_ASSERT_NOTOWNED)
#define KR_UNLOCK(_kr)                  \
	lck_mtx_unlock(&(_kr)->ckr_qlock)
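
/*
 * Typical multi-writer pattern (sketch): per the ckr_qlock comment in
 * struct __kern_channel_ring above, a flow switch or monitor path
 * brackets its slot manipulation with the ring's qlock:
 *
 *	KR_LOCK(kring);
 *	... claim slots, fill them, advance ckr_ktail ...
 *	KR_UNLOCK(kring);
 */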

/* valid values for ckr_flags */
#define CKRF_EXCLUSIVE          0x1     /* exclusive binding */
#define CKRF_DROP               0x2     /* drop all mode */
#define CKRF_HOST               0x4     /* host ring */
#define CKRF_MEM_RING_INITED    0x8     /* na_kr_setup() succeeded */
#define CKRF_MEM_SD_INITED      0x10    /* na_kr_setup() succeeded */
#define CKRF_EXT_RING_INITED    0x20    /* nxpi_ring_init() succeeded */
#define CKRF_EXT_SLOTS_INITED   0x40    /* nxpi_slot_init() succeeded */
#define CKRF_SLOT_CONTEXT       0x80    /* ckr_slot_ctxs is valid */
#define CKRF_MITIGATION         0x100   /* supports event mitigation */
#define CKRF_DEFUNCT            0x200   /* no longer in service */
#define CKRF_KERNEL_ONLY        (1U << 31) /* not usable by userland */

#define CKRF_BITS                                                       \
	"\020\01EXCLUSIVE\02DROP\03HOST\04MEM_RING_INITED"              \
	"\05MEM_SD_INITED\06EXT_RING_INITED\07EXT_SLOTS_INITED"         \
	"\010SLOT_CONTEXT\011MITIGATION\012DEFUNCT\040KERNEL_ONLY"

#define KRNA(_kr)       \
	((__DECONST(struct __kern_channel_ring *, _kr))->ckr_na)

#define KR_KERNEL_ONLY(_kr)     \
	(((_kr)->ckr_flags & CKRF_KERNEL_ONLY) != 0)
#define KR_DROP(_kr)            \
	(((_kr)->ckr_flags & (CKRF_DROP|CKRF_DEFUNCT)) != 0)

/* valid values for ckr_state */
enum {
	KR_READY = 0,
	KR_STOPPED,             /* unbounded stop */
	KR_LOCKED,              /* bounded, brief stop for mutual exclusion */
};

#define KR_KSD(_kring, _slot_idx)       \
	(SLOT_DESC_KSD(&(_kring)->ckr_ksds[_slot_idx]))

#define KR_USD(_kring, _slot_idx)       \
	(SLOT_DESC_USD(&(_kring)->ckr_usds[_slot_idx]))

__attribute__((always_inline))
static inline slot_idx_t
KR_SLOT_INDEX(const struct __kern_channel_ring *kr,
    const struct __slot_desc *slot)
{
	ASSERT(slot >= kr->ckr_ksds && slot <= kr->ckr_ksds_last);
	return (slot_idx_t)(slot - kr->ckr_ksds);
}
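
/*
 * Example (sketch): KR_KSD() and KR_SLOT_INDEX() round-trip between a
 * slot index and its descriptor:
 *
 *	struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
 *	ASSERT(KR_SLOT_INDEX(kring, &kring->ckr_ksds[idx]) == idx);
 */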

/* Helper macros for slot descriptor, decoupled for KSD/USD. */

#define KSD_VALID_METADATA(_ksd)                                        \
	((_ksd)->sd_md != NULL)

#define KSD_INIT(_ksd) do {                                             \
	(_ksd)->sd_md = NULL;                                           \
} while (0)

#define KSD_ATTACH_METADATA(_ksd, _md_addr) do {                        \
	ASSERT((_ksd) != NULL);                                         \
	ASSERT((_ksd)->sd_md == NULL);                                  \
	(_ksd)->sd_md = (_md_addr);                                     \
} while (0)

#define KSD_DETACH_METADATA(_ksd) do {                                  \
	ASSERT((_ksd) != NULL);                                         \
	ASSERT((_ksd)->sd_md != NULL);                                  \
	(_ksd)->sd_md = NULL;                                           \
} while (0)

#define KSD_RESET(_ksd) KSD_INIT(_ksd)

#define USD_INIT(_usd) do {                                             \
	(_usd)->sd_md_idx = OBJ_IDX_NONE;                               \
	(_usd)->sd_flags = 0;                                           \
	(_usd)->sd_len = 0;                                             \
} while (0)

#define USD_ATTACH_METADATA(_usd, _md_idx) do {                         \
	ASSERT((_usd) != NULL);                                         \
	ASSERT((_usd)->sd_md_idx == OBJ_IDX_NONE);                      \
	ASSERT(((_usd)->sd_flags & SD_IDX_VALID) == 0);                 \
	(_usd)->sd_md_idx = (_md_idx);                                  \
	(_usd)->sd_flags |= SD_IDX_VALID;                               \
	/* mask off non-user flags */                                   \
	(_usd)->sd_flags &= SD_FLAGS_USER;                              \
} while (0)

#define USD_DETACH_METADATA(_usd) do {                                  \
	ASSERT((_usd) != NULL);                                         \
	(_usd)->sd_md_idx = OBJ_IDX_NONE;                               \
	/* mask off non-user flags */                                   \
	(_usd)->sd_flags &= SD_FLAGS_USER;                              \
	(_usd)->sd_flags &= ~SD_IDX_VALID;                              \
} while (0)

#define USD_RESET(_usd) USD_INIT(_usd)

#define USD_SET_LENGTH(_usd, _md_len) do {                              \
	ASSERT((_usd) != NULL);                                         \
	(_usd)->sd_len = _md_len;                                       \
} while (0)

#define _USD_COPY(_src, _dst) do {                                      \
	_CASSERT(sizeof (struct __user_slot_desc) == 8);                \
	sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \
} while (0)

#define _USD_SWAP(_usd1, _usd2) do {                                    \
	struct __user_slot_desc _tusd                                   \
	    __attribute__((aligned(sizeof (uint64_t))));                \
	_USD_COPY(_usd1, &_tusd);                                       \
	_USD_COPY(_usd2, _usd1);                                        \
	_USD_COPY(&_tusd, _usd2);                                       \
} while (0)

#define _KSD_COPY(_src, _dst) do {                                      \
	_CASSERT(sizeof (struct __kern_slot_desc) == 8);                \
	sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \
} while (0)

#define _KSD_SWAP(_ksd1, _ksd2) do {                                    \
	struct __kern_slot_desc _tksd                                   \
	    __attribute__((aligned(sizeof (uint64_t))));                \
	_KSD_COPY(_ksd1, &_tksd);                                       \
	_KSD_COPY(_ksd2, _ksd1);                                        \
	_KSD_COPY(&_tksd, _ksd2);                                       \
} while (0)

#define SD_SWAP(_ksd1, _usd1, _ksd2, _usd2) do {                        \
	_USD_SWAP(_usd1, _usd2);                                        \
	_KSD_SWAP(_ksd1, _ksd2);                                        \
	/* swap packet attachment */                                    \
	*(struct __kern_slot_desc **)(uintptr_t)&(_ksd1)->sd_qum->qum_ksd = \
	    (_ksd1); \
	*(struct __kern_slot_desc **)(uintptr_t)&(_ksd2)->sd_qum->qum_ksd = \
	    (_ksd2); \
} while (0)

#define _MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim) do { \
	struct __kern_quantum *_q = SK_PTR_ADDR_KQUM(_md);              \
	switch (METADATA_TYPE(_q)) {                                    \
	case NEXUS_META_TYPE_PACKET: {                                  \
	        struct __kern_packet *_p =                              \
	            (struct __kern_packet *)(void *)(_md);              \
	        struct __kern_buflet *_kbft;                            \
	        PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft);      \
	        (_addr) = __DECONST(void *, _kbft->buf_addr);           \
	        (_objaddr) = _kbft->buf_objaddr;                        \
	        (_doff) = _kbft->buf_doff;                              \
	        (_dlen) = _kbft->buf_dlen;                              \
	        (_dlim) = _kbft->buf_dlim;                              \
	        break;                                                  \
	}                                                               \
	default:                                                        \
	        (_addr) = __DECONST(void *, _q->qum_buf[0].buf_addr);   \
	        (_objaddr) = _q->qum_buf[0].buf_objaddr;                \
	        (_doff) = _q->qum_buf[0].buf_doff;                      \
	        (_dlen) = _q->qum_buf[0].buf_dlen;                      \
	        (_dlim) = _q->qum_buf[0].buf_dlim;                      \
	        break;                                                  \
	}                                                               \
	ASSERT((_addr) != NULL);                                        \
	ASSERT((_objaddr) != NULL);                                     \
} while (0)

#define _MD_BUFLET_ADDR_PKT(_md, _addr) do { \
	ASSERT(METADATA_TYPE(SK_PTR_ADDR_KQUM(_md)) ==                  \
	    NEXUS_META_TYPE_PACKET);                                    \
	struct __kern_packet *_p = (struct __kern_packet *)(void *)(_md); \
	struct __kern_buflet *_kbft;                                    \
	PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft);              \
	(_addr) = __DECONST(void *, _kbft->buf_addr);                   \
	ASSERT((_addr) != NULL);                                        \
} while (0)

/*
 * Return the data offset adjusted virtual address of a buffer associated
 * with the metadata; for metadata with multiple buflets, this is the
 * first buffer's address.
 */
#define MD_BUFLET_ADDR(_md, _val) do {                                  \
	void *__unsafe_indexable _addr, *__unsafe_indexable _objaddr;   \
	uint32_t _doff, _dlen, _dlim;                                   \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim);  \
	/* skip past buflet data offset */                              \
	(_val) = (void *)((uint8_t *)_addr + _doff);                    \
} while (0)
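
/*
 * Example usage (sketch): fetching a pointer to the start of data in a
 * metadata's first buflet, where 'md' is a tagged metadata pointer as
 * accepted by SK_PTR_ADDR_KQUM():
 *
 *	uint8_t *data;
 *	MD_BUFLET_ADDR(md, data);
 *	... up to the buflet's buf_dlen bytes are readable at 'data' ...
 */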

/*
 * Return the absolute virtual address of a buffer associated with the
 * metadata; for metadata with multiple buflets, this is the first
 * buffer's address.
 */
#define MD_BUFLET_ADDR_ABS(_md, _val) do {                              \
	void *__unsafe_indexable _addr, *__unsafe_indexable _objaddr;   \
	uint32_t _doff, _dlen, _dlim;                                   \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim);  \
	(_val) = (void *)_addr;                                         \
} while (0)

/* similar to MD_BUFLET_ADDR_ABS() but optimized only for packets */
#define MD_BUFLET_ADDR_ABS_PKT(_md, _val) do {                          \
	void *__unsafe_indexable _addr;                                 \
	_MD_BUFLET_ADDR_PKT(_md, _addr);                                \
	(_val) = (void *)_addr;                                         \
} while (0)

#define MD_BUFLET_ADDR_ABS_DLEN(_md, _val, _dlen, _dlim, _doff) do {    \
	void *__unsafe_indexable _addr, *__unsafe_indexable _objaddr;   \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim);  \
	(_val) = (void *)_addr;                                         \
} while (0)

/*
 * Return the buffer's object address associated with the metadata; for
 * metadata with multiple buflets, this is the first buffer's object address.
 */
#define MD_BUFLET_OBJADDR(_md, _val) do {                               \
	void *__unsafe_indexable _addr, *__unsafe_indexable _objaddr;   \
	uint32_t _doff, _dlen, _dlim;                                   \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim);  \
	(_val) = (void *)_objaddr;                                      \
} while (0)

/*
 * Return the data offset adjusted virtual address of a buffer associated
 * with the metadata; for metadata with multiple buflets, this is the
 * first buffer's address and data length.
 */
#define MD_BUFLET_ADDR_DLEN(_md, _val, _dlen) do {                      \
	void *__unsafe_indexable _addr, *__unsafe_indexable _objaddr;   \
	uint32_t _doff, _dlim;                                          \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim);  \
	/* skip past buflet data offset */                              \
	(_val) = (void *)((uint8_t *)_addr + _doff);                    \
} while (0)

/* kr_available_slots: return available space for enqueue into kring */
__attribute__((always_inline))
static inline uint32_t
kr_available_slots(struct __kern_channel_ring *kr)
{
	int busy;
	uint32_t space;

	busy = (int)(kr->ckr_klease - kr->ckr_khead);
	if (busy < 0) {
		busy += kr->ckr_num_slots;
	}
	space = kr->ckr_lim - (uint32_t)busy;

	return space;
}

/* kr_available_slots_rxring: return available space for enqueue into RX kring */
__attribute__((always_inline))
static inline uint32_t
kr_available_slots_rxring(struct __kern_channel_ring *rxkring)
{
	int busy;
	uint32_t space;

	/* # of rx busy (unclaimed) slots */
	busy = (int)(rxkring->ckr_ktail - rxkring->ckr_khead);
	if (busy < 0) {
		busy += rxkring->ckr_num_slots;
	}

	/* # of rx avail free slots (subtract busy from max) */
	space = rxkring->ckr_lim - (uint32_t)busy;
	return space;
}
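
/*
 * Example (sketch): a flow-switch writer would check for room before
 * reserving a block of n slots, in the spirit of the
 * nx_fsw_vp_na_kr_space()/nx_fsw_vp_na_kr_lease() description above:
 *
 *	if (kr_available_slots_rxring(rxkring) < n)
 *		return ENOBUFS;         (no room; drop or defer)
 */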

extern kern_allocation_name_t skmem_tag_ch_key;

#if (DEVELOPMENT || DEBUG)
SYSCTL_DECL(_kern_skywalk_channel);
#endif /* DEVELOPMENT || DEBUG */

__BEGIN_DECLS
extern int channel_init(void);
extern void channel_fini(void);

extern struct kern_channel *ch_open(struct ch_init *, struct proc *,
    int, int *);
extern struct kern_channel *ch_open_special(struct kern_nexus *,
    struct chreq *, boolean_t, int *);
extern void ch_close(struct kern_channel *, boolean_t);
extern void ch_close_special(struct kern_channel *);
extern int ch_kqfilter(struct kern_channel *, struct knote *,
    struct kevent_qos_s *kev);
extern boolean_t ch_is_multiplex(struct kern_channel *, enum txrx);
extern int ch_select(struct kern_channel *, int, void *, struct proc *);
extern int ch_get_opt(struct kern_channel *, struct sockopt *);
extern int ch_set_opt(struct kern_channel *, struct sockopt *);
extern void ch_deactivate(struct kern_channel *);
extern void ch_retain(struct kern_channel *);
extern void ch_retain_locked(struct kern_channel *);
extern int ch_release(struct kern_channel *);
extern int ch_release_locked(struct kern_channel *);
extern void ch_dtor(void *);

extern void csi_init(struct ch_selinfo *, boolean_t, uint64_t);
extern void csi_destroy(struct ch_selinfo *);
extern void csi_selrecord_one(struct __kern_channel_ring *, struct proc *,
    void *);
extern void csi_selrecord_all(struct nexus_adapter *, enum txrx, struct proc *,
    void *);
extern void csi_selwakeup_one(struct __kern_channel_ring *, boolean_t,
    boolean_t, boolean_t, uint32_t);
extern void csi_selwakeup_all(struct nexus_adapter *, enum txrx, boolean_t,
    boolean_t, boolean_t, uint32_t);

extern void kr_init_to_mhints(struct __kern_channel_ring *, uint32_t);
extern int kr_enter(struct __kern_channel_ring *, boolean_t);
extern void kr_exit(struct __kern_channel_ring *);
extern void kr_start(struct __kern_channel_ring *);
extern void kr_stop(struct __kern_channel_ring *kr, uint32_t state);
extern void kr_update_stats(struct __kern_channel_ring *kring,
    uint32_t slot_count, uint32_t byte_count);
extern boolean_t kr_txempty(struct __kern_channel_ring *kring);
extern uint32_t kr_reclaim(struct __kern_channel_ring *kr);

extern slot_idx_t kr_txsync_prologue(struct kern_channel *,
    struct __kern_channel_ring *, struct proc *);
extern int kr_txprologue(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);
extern int kr_txprologue_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);

extern void kr_txsync_finalize(struct kern_channel *,
    struct __kern_channel_ring *, struct proc *);
extern void kr_txfinalize(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
extern void kr_txfinalize_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);

extern slot_idx_t kr_rxsync_prologue(struct kern_channel *ch,
    struct __kern_channel_ring *kring, struct proc *p);
extern int kr_rxprologue(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);
extern int kr_rxprologue_nodetach(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);
extern int kr_rxprologue_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);

extern void kr_rxsync_finalize(struct kern_channel *ch,
    struct __kern_channel_ring *kring, struct proc *p);
extern void kr_rxfinalize(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
extern void kr_rxfinalize_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);

extern void kr_txkring_reclaim_and_refill(struct __kern_channel_ring *kring,
    slot_idx_t index);
extern slot_idx_t kr_alloc_sync_prologue(struct __kern_channel_ring *kring,
    struct proc *p);
extern slot_idx_t kr_free_sync_prologue(struct __kern_channel_ring *kring,
    struct proc *p);
extern void kr_alloc_sync_finalize(struct __kern_channel_ring *kring,
    struct proc *p);
extern void kr_free_sync_finalize(struct __kern_channel_ring *kring,
    struct proc *p);
extern int kr_internalize_metadata(struct kern_channel *,
    struct __kern_channel_ring *, const uint32_t, struct __kern_quantum *,
    struct proc *);
extern void kr_externalize_metadata(struct __kern_channel_ring *,
    const uint32_t, struct __kern_quantum *, struct proc *);
extern slot_idx_t kr_event_sync_prologue(struct __kern_channel_ring *kring,
    struct proc *p);
extern void kr_event_sync_finalize(struct kern_channel *ch,
    struct __kern_channel_ring *kring, struct proc *p);

#if SK_LOG
extern void kr_log_bad_ring(struct __kern_channel_ring *);
#else
#define kr_log_bad_ring(_kr)    do { ((void)0); } while (0)
#endif /* SK_LOG */
__END_DECLS
#endif /* BSD_KERNEL_PRIVATE */
#endif /* !_SKYWALK_CHANNEL_CHANNELVAR_H_ */