1 /*
2 * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
31 * All rights reserved.
32 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 */
55
56 #ifndef _SKYWALK_CHANNEL_CHANNELVAR_H_
57 #define _SKYWALK_CHANNEL_CHANNELVAR_H_
58
59 #ifdef BSD_KERNEL_PRIVATE
60 #include <skywalk/core/skywalk_var.h>
61 #include <skywalk/os_channel_private.h>
62 #include <skywalk/nexus/nexus_mbq.h>
63 #include <skywalk/nexus/nexus_pktq.h>
64 #include <skywalk/mem/skmem_region_var.h>
65 #include <skywalk/mem/skmem_arena_var.h>
66
/*
 * Per-channel/per-ring select state: wraps a selinfo used for
 * selrecord/kevent wakeups, with optional event mitigation
 * (CSI_MITIGATION) driven by a thread call.
 */
struct ch_selinfo {
	decl_lck_mtx_data(, csi_lock);	/* serializes access (CSI_LOCK/UNLOCK) */
	struct selinfo csi_si;		/* selrecord/knote wait info */
	uint32_t csi_flags;		/* CSI_* flags (below) */
	uint32_t csi_pending;		/* NOTE(review): pending notifications — inferred from name */
	uint64_t csi_eff_interval;	/* effective mitigation interval, ns (see CH_MIT_IVAL_*) */
	uint64_t csi_interval;		/* configured mitigation interval, ns */
	thread_call_t csi_tcall;	/* thread call — presumably for deferred (mitigated) notify */
};
76
77 /* values for csi_flags */
78 #define CSI_KNOTE 0x1 /* kernel note attached */
79 #define CSI_MITIGATION 0x10 /* has mitigation */
80 #define CSI_DESTROYED (1U << 31) /* has been destroyed */
81
82 #define CSI_LOCK(_csi) \
83 lck_mtx_lock(&(_csi)->csi_lock)
84 #define CSI_LOCK_ASSERT_HELD(_csi) \
85 LCK_MTX_ASSERT(&(_csi)->csi_lock, LCK_MTX_ASSERT_OWNED)
86 #define CSI_LOCK_ASSERT_NOTHELD(_csi) \
87 LCK_MTX_ASSERT(&(_csi)->csi_lock, LCK_MTX_ASSERT_NOTOWNED)
88 #define CSI_UNLOCK(_csi) \
89 lck_mtx_unlock(&(_csi)->csi_lock)
90
91 /* mitigation intervals in ns */
92 #define CH_MIT_IVAL_DEFAULT (0)
93 #define CH_MIT_IVAL_WIFI CH_MIT_IVAL_DEFAULT
94 #define CH_MIT_IVAL_CELLULAR CH_MIT_IVAL_DEFAULT
95 #define CH_MIT_IVAL_ETHERNET CH_MIT_IVAL_DEFAULT
96
97 /*
98 * Kernel version of __user_slot_desc.
99 *
100 * Keep slot descriptor as minimal as possible.
101 * TODO: [email protected] -- Should we make use of RX/TX
102 * preparation/writeback descriptors (in a union)?
103 */
struct __kern_slot_desc {
	union {
		struct __kern_quantum *sd_qum;	/* metadata viewed as quantum */
		struct __kern_packet *sd_pkt;	/* metadata viewed as packet */
		struct __kern_buflet *sd_buf;	/* metadata viewed as buflet */
		void *sd_md; /* metadata address */
	};

#ifndef __LP64__
	/* pad to 8 bytes; _KSD_COPY/_KSD_SWAP assert sizeof == 8 */
	uint32_t _sd_pad[1];
#endif /* !__LP64__ */
};
116
117 /* _sd_{user,kern} are at same offset in the preamble */
118 #define SLOT_DESC_KSD(_sdp) \
119 ((struct __kern_slot_desc *)((uintptr_t)&(_sdp)->_sd_private))
120
/*
 * Optional, per-slot context information. An array of these structures
 * is allocated per nexus_adapter, and each real kring will have its slots
 * correspond to one. The 'arg' value is supplied via the slot_init
 * nexus provider callback, and is subsequently retrievable via calls
 * to kern_channel_slot_get_context().
 */
struct slot_ctx {
	mach_vm_address_t slot_ctx_arg; /* per-slot context */
};
131
132 extern lck_attr_t channel_lock_attr;
133 extern uint64_t __ch_umd_redzone_cookie;
134 extern uint32_t kr_stat_enable;
135
136 struct kern_nexus;
137 enum na_sync_mode;
138
/*
 * Kernel representation of an open channel: the endpoint that connects
 * a client (process or kernel) to a nexus adapter.
 */
struct kern_channel {
	decl_lck_mtx_data(, ch_lock);	/* channel lock */
	struct nexus_adapter *ch_na;	/* nexus adapter */
	struct kern_nexus *ch_nexus;	/* owning nexus */
	struct ch_info *ch_info;	/* channel info */
	struct kern_pbufpool *ch_pp;	/* packet buffer pool */

	uint32_t ch_refcnt;		/* reference count */
	volatile uint32_t ch_flags;	/* CHANF_* flags */

	/* range of tx/rx/allocator/event rings to scan */
	ring_id_t ch_first[NR_ALL];
	ring_id_t ch_last[NR_ALL];

	/* NOTE(review): presumably shared with userspace via ch_mmap — confirm */
	struct __user_channel_schema *ch_schema;

	/*
	 * Pointers to the selinfo to be used for selrecord.
	 * Either the local or the global one depending on the
	 * number of rings.
	 */
	struct ch_selinfo *ch_si[NR_ALL];

	STAILQ_ENTRY(kern_channel) ch_link;		/* list linkage */
	STAILQ_ENTRY(kern_channel) ch_link_if_adv;	/* interface advisory linkage (CHANF_IF_ADV) */
	void *ch_ctx;			/* opaque context */
	mach_vm_offset_t ch_schema_offset;	/* NOTE(review): schema offset in mapped arena — inferred */
	struct skmem_arena_mmap_info ch_mmap;	/* arena mmap state */
	int ch_fd; /* might be -1 if no fd */
	pid_t ch_pid; /* process ID */
	char ch_name[32]; /* process name */
};
171
172 /* valid values for ch_flags */
173 #define CHANF_ATTACHED 0x1 /* attached and connected to nexus */
174 #define CHANF_PLATFORM 0x2 /* platform binary process */
175 #define CHANF_KERNEL 0x4 /* kernel only; has no task map */
176 #define CHANF_RXONLY 0x8 /* receive only, no transmit */
177 #define CHANF_USER_PACKET_POOL 0x10 /* userspace using packet pool */
178 #define CHANF_EXCLUSIVE 0x20 /* exclusive bind to ring(s) */
179 #define CHANF_NONXREF 0x40 /* has no nexus reference */
180 #define CHANF_HOST 0x80 /* opened to host (kernel) stack */
181 #define CHANF_EXT_SKIP 0x100 /* don't notify external provider */
182 #define CHANF_EXT_PRECONNECT 0x200 /* successful nxpi_pre_connect() */
183 #define CHANF_EXT_CONNECTED 0x400 /* successful nxpi_connected() */
184 #define CHANF_EVENT_RING 0x1000 /* channel has event rings */
185 #define CHANF_IF_ADV 0x2000 /* interface advisory is active */
186 #define CHANF_DEFUNCT_SKIP 0x4000 /* defunct skipped due to active use */
187 #define CHANF_CLOSING (1U << 30) /* channel is being closed */
188 #define CHANF_DEFUNCT (1U << 31) /* channel is now defunct */
189
190 #define CHANF_BITS \
191 "\020\01ATTACHED\02PLATFORM\03KERNEL\04RXONLY\05USER_PKT_POOL" \
192 "\06EXCLUSIVE\07NONXREF\010HOST\011EXT_SKIP\012EXT_PRECONNECT" \
193 "\013EXT_CONNECTED\015EVENT\016ADVISORY" \
194 "\017DEFUNCT_SKIP\037CLOSING\040DEFUNCT"
195
196 /* valid values for ch_kevhints */
197 #define CHAN_FILT_HINT_FLOW_ADV_UPD 0x1 /* flow advisory update */
198 #define CHAN_FILT_HINT_CHANNEL_EVENT 0x2 /* channel event */
199 #define CHAN_FILT_HINT_IF_ADV_UPD 0x4 /* Interface advisory update */
200
201 #define CHAN_FILT_HINT_BITS "\020\01FLOW_ADV\02CHANNEL_EVENT\03IF_ADV"
202
/* selection of rings covered by a channel (see chreq cr_ring_set) */
typedef enum {
	RING_SET_ALL = 0, /* all rings */
	RING_SET_DEFAULT = RING_SET_ALL,
} ring_set_t;
207
/* type of nexus endpoint a channel is attached to */
typedef enum {
	CH_ENDPOINT_NULL = 0,		/* no endpoint */
	CH_ENDPOINT_USER_PIPE_MASTER,	/* user pipe, master side */
	CH_ENDPOINT_USER_PIPE_SLAVE,	/* user pipe, slave side */
	CH_ENDPOINT_KERNEL_PIPE,	/* kernel pipe */
	CH_ENDPOINT_NET_IF,		/* network interface nexus */
	CH_ENDPOINT_FLOW_SWITCH,	/* flow switch nexus */
} ch_endpoint_t;
216
217 #define CHREQ_NAMELEN 64
218
/*
 * Channel request: parameters supplied when opening/binding a channel.
 * Direction of each field relative to the caller is noted (in/out).
 */
struct chreq {
	char cr_name[CHREQ_NAMELEN]; /* in */
	uuid_t cr_spec_uuid; /* in */
	struct ch_ev_thresh cr_tx_lowat; /* in */
	struct ch_ev_thresh cr_rx_lowat; /* in */
	nexus_port_t cr_port; /* in/out */
	uint32_t cr_mode; /* in */
	uint32_t cr_pipe_id; /* in */
	ring_id_t cr_ring_id; /* in */
	ring_set_t cr_ring_set; /* out */
	ch_endpoint_t cr_real_endpoint; /* out */
	ch_endpoint_t cr_endpoint; /* out */
	mach_vm_size_t cr_memsize; /* out */
	mach_vm_offset_t cr_memoffset; /* out */
};
234
235 /*
236 * Private, kernel view of a ring. Keeps track of the status of
237 * a ring across system calls.
238 *
239 * ckr_khead Index of the next buffer to refill. It corresponds
240 * to ring_head at the time the system call returns.
241 *
242 * ckr_ktail Index of the first buffer owned by the kernel.
243 *
244 * On RX, ckr_khead to ckr_ktail are receive buffers that
245 * are not yet released. ckr_khead is advanced following
246 * ring_head, ckr_ktail is advanced on incoming packets.
247 *
248 * On TX, ckr_rhead has been filled by the sender but not
249 * sent yet to the destination; ckr_rhead to ckr_ktail are
250 * available for new transmissions, and ckr_ktail to
251 * ckr_khead-1 are pending transmissions.
252 *
253 * Here is the layout for the RX and TX rings.
254 *
255 * RX RING TX RING
256 *
257 * +-----------------+ +-----------------+
258 * | | | |
259 * |XXX free slot XXX| |XXX free slot XXX|
260 * +-----------------+ +-----------------+
261 * head->| owned by user |<-khead | not sent to nic |<-khead
262 * | | | yet |
263 * | | | |
264 * +-----------------+ + ------ +
265 * tail->| |<-ktail | |<-klease
266 * | (being | ... | | ...
267 * | prepared) | ... | | ...
268 * +-----------------+ ... | | ...
269 * | |<-klease +-----------------+
270 * | | tail->| |<-ktail
271 * | | | |
272 * | | | |
273 * | | | |
274 * +-----------------+ +-----------------+
275 *
276 * The head/tail (user view) and khead/ktail (kernel view)
277 * are used in the normal operation of the adapter.
278 *
279 * For flow switch nexus:
280 *
281 * The following fields are used to implement lock-free copy of packets
282 * from input to output ports in flow switch:
283 *
284 * ckr_klease Buffer after the last one being copied.
285 * A writer in nx_fsw_vp_flush() reserves N buffers
286 * from ckr_klease, advances it, then does the
287 * copy outside the lock.
288 *
289 * In RX rings (used for flow switch ports):
 * ckr_ktail <= ckr_klease < ckr_khead+N-1
 *
 * In TX rings (used for NIC or host stack ports):
 * ckr_khead <= ckr_klease < ckr_ktail
294 *
295 * ckr_leases Array of ckr_num_slots where writers can report
296 * completion of their block. CKR_NOSLOT (~0) indicates
297 * that the writer has not finished yet
298 *
299 * ckr_lease_idx Index of next free slot in ckr_leases, to be assigned.
300 *
301 * The kring is manipulated by txsync/rxsync and generic kring function.
302 *
303 * Concurrent rxsync or txsync on the same ring are prevented through
304 * by na_kr_(try)get() which in turn uses ckr_busy. This is all we need
305 * for NIC rings, and for TX rings attached to the host stack.
306 *
307 * RX rings attached to the host stack use an nx_mbq (ckr_rx_queue) on both
308 * nx_netif_rxsync_from_host() and nx_netif_compat_transmit(). The nx_mbq is
309 * protected by its internal lock.
310 *
311 * RX rings attached to the flow switch are accessed by both senders
312 * and receiver. They are protected through the q_lock on the RX ring.
313 *
314 * When a ring is the output of a switch port (RX ring for a flow switch
315 * port, TX ring for the host stack or NIC), slots are reserved in blocks
316 * through ckr_klease which points to the next unused slot.
317 *
318 * On an RX ring, ckr_klease is always after ckr_ktail, and completions cause
319 * ckr_ktail to advance. On a TX ring, ckr_klease is always between ckr_khead
320 * and ckr_ktail, and completions cause ckr_khead to advance.
321 *
322 * nx_fsw_vp_na_kr_space()
323 * returns the maximum number of slots that can be assigned.
324 *
325 * nx_fsw_vp_na_kr_lease() reserves the required number of buffers,
326 * advances ckr_klease and also returns an entry in a circular
327 * array where completions should be reported.
328 *
329 * For netif nexus:
330 *
331 * The indexes in the NIC and rings are offset by ckr_hwofs slots. This is
332 * so that, on a reset, buffers owned by userspace are not modified by the
333 * kernel. In particular:
334 *
335 * RX rings: the next empty buffer (ckr_ktail + ckr_hwofs) coincides with
336 * the next empty buffer as known by the hardware "next to check".
337 * TX rings: ckr_khead + ckr_hwofs coincides with "next to send".
338 *
339 */
/* per-ring notify callback: (kring, process, flags) */
typedef int (*channel_ring_notify_t)(struct __kern_channel_ring *,
    struct proc *, uint32_t);

struct __kern_channel_ring {
	struct __user_channel_ring *ckr_ring;	/* shared (user-visible) ring */

	uint32_t ckr_flags; /* CKRF_* flags */
	slot_idx_t ckr_num_slots; /* # of slots */
	uint32_t ckr_max_pkt_len;/* max pp pkt size */
	uint32_t ckr_largest; /* largest packet seen */
	const slot_idx_t ckr_lim; /* ckr_num_slots - 1 */
	enum txrx ckr_tx; /* kind of ring (tx/rx/alloc/free) */

	/* kernel head/tail indices; see the big comment above */
	volatile slot_idx_t ckr_khead;
	volatile slot_idx_t ckr_ktail;
	/*
	 * value of ckr_khead recorded at TX prologue (pre-sync)
	 */
	volatile slot_idx_t ckr_khead_pre;
	/*
	 * Copies of values in user rings, so we do not need to look
	 * at the ring (which could be modified). These are set in the
	 * *sync_prologue()/finalize() routines.
	 */
	volatile slot_idx_t ckr_rhead;
	volatile slot_idx_t ckr_rtail;

	/* EWMA decay rate */
	uint32_t ckr_transfer_decay;

	uint64_t ckr_ready_bytes;
	uint64_t ckr_ready_slots;

	/*
	 * While ckr_state is set, no new [tr]xsync operations can be
	 * started on this kring. This is used by na_disable_all_rings()
	 * to find a synchronization point where critical data structures
	 * pointed to by the kring can be added or removed.
	 */
	decl_lck_spin_data(, ckr_slock);
	struct thread *ckr_owner; /* busy owner */
	uint32_t ckr_busy; /* prevent kring modifications */
	uint32_t ckr_want; /* # of threads that lost the race */
	uint32_t ckr_state; /* KR_* states */

	/* current working set for the allocator ring */
	volatile uint32_t ckr_alloc_ws;

	struct nexus_adapter *ckr_na; /* adapter this kring belongs to */
	struct kern_pbufpool *ckr_pp; /* adapter's packet buffer pool */

	/*
	 * Array of __slot_desc each representing slot-specific data, e.g.
	 * index to metadata, etc. There is exactly one descriptor for each
	 * slot in the ring. Note that the size of the array may be greater
	 * than the number of slots for this ring, and so we constrain
	 * range with [ckr_ksds, ckr_ksds_last] during validations.
	 */
	struct __slot_desc *ckr_usds; /* slot desc array (user) */
	struct __slot_desc *ckr_ksds; /* slot desc array (kernel) */
	struct __slot_desc *ckr_ksds_last; /* cache last ksd */
	struct skmem_cache *ckr_ksds_cache; /* owning skmem_cache for ksd */

	uint32_t ckr_ring_id; /* ring ID */

	boolean_t ckr_rate_limited; /* ring is rate limited */

	/*
	 * Array of packet handles for as many slots as there are in the
	 * ring; this is useful for storing an array of kern_packet_t to
	 * be used when invoking the packet APIs. Only safe to be used
	 * in the context of a sync as we're single-threaded then.
	 * The memory is owned by the nexus adapter.
	 */
	uint64_t *ckr_scratch;

	/*
	 * [tx]sync callback for this kring. The default na_kring_create
	 * callback (na_kr_create) sets the ckr_na_sync callback of each
	 * tx(rx) kring to the corresponding na_txsync(na_rxsync) taken
	 * from the nexus_adapter.
	 *
	 * Overrides: the above configuration is not changed by
	 * any of the nm_krings_create callbacks.
	 */
	int (*ckr_na_sync)(struct __kern_channel_ring *,
	    struct proc *, uint32_t);
	int(*volatile ckr_na_notify)(struct __kern_channel_ring *,
	    struct proc *, uint32_t);

	/* per-sync entry/exit hooks run around ckr_na_sync */
	int (*ckr_prologue)(struct kern_channel *,
	    struct __kern_channel_ring *, const slot_idx_t,
	    uint32_t *, uint64_t *, struct proc *);
	void (*ckr_finalize)(struct kern_channel *,
	    struct __kern_channel_ring *, const slot_idx_t, struct proc *);

	/* time of last channel sync (updated at sync prologue time) */
	uint64_t ckr_sync_time;

#if CONFIG_NEXUS_FLOWSWITCH
	/* The following fields are for flow switch support */
	int (*ckr_save_notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	uint32_t *ckr_leases;
#define CKR_NOSLOT ((uint32_t)~0) /* used in ckr_*lease* */
	slot_idx_t ckr_klease;
	slot_idx_t ckr_lease_idx;
#endif /* CONFIG_NEXUS_FLOWSWITCH */

	kern_packet_svc_class_t ckr_svc;	/* service class of this ring */

	/*
	 * (Optional) array of slot contexts for as many slots as there
	 * are in the ring; the memory is owned by the nexus adapter.
	 */
	uint32_t ckr_slot_ctxs_set; /* number of valid/set contexts */
	struct slot_ctx *ckr_slot_ctxs; /* (optional) array of slot contexts */

	void *ckr_ctx; /* ring context */

	struct ch_selinfo ckr_si; /* per-ring wait queue */

#if CONFIG_NEXUS_NETIF
	/*
	 * netif adapters intercept ckr_na_notify in order to
	 * mitigate IRQ events; the actual notification is done
	 * by invoking the original notify callback routine
	 * saved at na_activate() time.
	 */
	int (*ckr_netif_notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	void (*ckr_netif_mit_stats)(struct __kern_channel_ring *kring,
	    uint64_t, uint64_t);
	struct nx_netif_mit *ckr_mit;

	volatile uint32_t ckr_pending_intr;
	volatile uint32_t ckr_pending_doorbell;

	/*
	 * Support for adapters without native Skywalk support.
	 * On tx rings we preallocate an array of tx buffers
	 * (same size as the channel ring), on rx rings we
	 * store incoming mbufs in a queue that is drained by
	 * a rxsync.
	 */
	struct mbuf **ckr_tx_pool;
	struct nx_mbq ckr_rx_queue; /* intercepted rx mbufs. */
#endif /* CONFIG_NEXUS_NETIF */

#if CONFIG_NEXUS_USER_PIPE
	/* if this is a pipe ring, pointer to the other end */
	struct __kern_channel_ring *ckr_pipe;
	/* pointer to hidden rings (see nx_user_pipe.c for details) */
	struct __user_channel_ring *ckr_save_ring;
#endif /* CONFIG_NEXUS_USER_PIPE */

	/*
	 * Protects kring in the event of multiple writers;
	 * only used by flow switch and monitor.
	 */
	decl_lck_mtx_data(, ckr_qlock);

#if CONFIG_NEXUS_MONITOR
	/* array of krings that are monitoring this kring */
	struct __kern_channel_ring **ckr_monitors;
	uint32_t ckr_max_monitors; /* current size of the monitors array */
	uint32_t ckr_n_monitors; /* next unused entry in the monitor array */
	/*
	 * Monitors work by intercepting the sync and notify callbacks of the
	 * monitored krings. This is implemented by replacing the pointers
	 * above and saving the previous ones in mon_* pointers below
	 */
	int (*ckr_mon_sync)(struct __kern_channel_ring *kring, struct proc *,
	    uint32_t flags);
	int (*ckr_mon_notify)(struct __kern_channel_ring *kring, struct proc *,
	    uint32_t flags);

	uint32_t ckr_mon_tail; /* last seen slot on rx */
	/* index of this ring in the monitored ring array */
	uint32_t ckr_mon_pos;
#endif /* CONFIG_NEXUS_MONITOR */

	uint32_t ckr_users; /* existing bindings for this ring */

	/* ring flush rate limit */
	int64_t ckr_tbr_token;
	int64_t ckr_tbr_depth;
	uint64_t ckr_tbr_last;
#define CKR_TBR_TOKEN_INVALID INT64_MAX

	/* stats capturing errors */
	channel_ring_error_stats ckr_err_stats
	__attribute__((aligned(sizeof(uint64_t))));

	/* stats capturing actual data movement (nexus provider's view) */
	channel_ring_stats ckr_stats
	__attribute__((aligned(sizeof(uint64_t))));
	uint64_t ckr_accumulated_bytes;
	uint64_t ckr_accumulated_slots;
	uint64_t ckr_accumulate_start; /* in seconds */

	/* stats capturing user activities per sync (user's view) */
	channel_ring_user_stats ckr_usr_stats
	__attribute__((aligned(sizeof(uint64_t))));
	uint64_t ckr_user_accumulated_bytes;
	uint64_t ckr_user_accumulated_slots;
	uint64_t ckr_user_accumulated_syncs;
	uint64_t ckr_user_accumulate_start; /* in seconds */

	lck_grp_t *ckr_qlock_group;	/* lock group for ckr_qlock */
	lck_grp_t *ckr_slock_group;	/* lock group for ckr_slock */

	char ckr_name[64]; /* diagnostic */
} __attribute__((__aligned__(CHANNEL_CACHE_ALIGN_MAX)));
554
555 #define KR_LOCK(_kr) \
556 lck_mtx_lock(&(_kr)->ckr_qlock)
557 #define KR_LOCK_SPIN(_kr) \
558 lck_mtx_lock_spin(&(_kr)->ckr_qlock)
559 #define KR_LOCK_TRY(_kr) \
560 lck_mtx_try_lock(&(_kr)->ckr_qlock)
561 #define KR_LOCK_ASSERT_HELD(_kr) \
562 LCK_MTX_ASSERT(&(_kr)->ckr_qlock, LCK_MTX_ASSERT_OWNED)
563 #define KR_LOCK_ASSERT_NOTHELD(_kr) \
564 LCK_MTX_ASSERT(&(_kr)->ckr_qlock, LCK_MTX_ASSERT_NOTOWNED)
565 #define KR_UNLOCK(_kr) \
566 lck_mtx_unlock(&(_kr)->ckr_qlock)
567
568 /* valid values for ckr_flags */
569 #define CKRF_EXCLUSIVE 0x1 /* exclusive binding */
570 #define CKRF_DROP 0x2 /* drop all mode */
571 #define CKRF_HOST 0x4 /* host ring */
572 #define CKRF_MEM_RING_INITED 0x8 /* na_kr_setup() succeeded */
573 #define CKRF_MEM_SD_INITED 0x10 /* na_kr_setup() succeeded */
574 #define CKRF_EXT_RING_INITED 0x20 /* nxpi_ring_init() succeeded */
575 #define CKRF_EXT_SLOTS_INITED 0x40 /* nxpi_slot_init() succeeded */
576 #define CKRF_SLOT_CONTEXT 0x80 /* ckr_slot_ctxs is valid */
577 #define CKRF_MITIGATION 0x100 /* supports event mitigation */
578 #define CKRF_DEFUNCT 0x200 /* no longer in service */
579 #define CKRF_KERNEL_ONLY (1U << 31) /* not usable by userland */
580
581 #define CKRF_BITS \
582 "\020\01EXCLUSIVE\02DROP\03HOST\04MEM_RING_INITED" \
583 "\05MEM_SD_INITED\06EXT_RING_INITED\07EXT_SLOTS_INITED" \
584 "\010SLOT_CONTEXT\011MITIGATION\012DEFUNCT\040KERNEL_ONLY"
585
586 #define KRNA(_kr) \
587 ((__DECONST(struct __kern_channel_ring *, _kr))->ckr_na)
588
589 #define KR_KERNEL_ONLY(_kr) \
590 (((_kr)->ckr_flags & CKRF_KERNEL_ONLY) != 0)
591 #define KR_DROP(_kr) \
592 (((_kr)->ckr_flags & (CKRF_DROP|CKRF_DEFUNCT)) != 0)
593
/* valid values for ckr_state */
enum {
	KR_READY = 0,	/* sync operations may proceed */
	KR_STOPPED, /* unbounded stop */
	KR_LOCKED, /* bounded, brief stop for mutual exclusion */
};
600
601 #define KR_KSD(_kring, _slot_idx) \
602 (SLOT_DESC_KSD(&(_kring)->ckr_ksds[_slot_idx]))
603
604 #define KR_USD(_kring, _slot_idx) \
605 (SLOT_DESC_USD(&(_kring)->ckr_usds[_slot_idx]))
606
607 __attribute__((always_inline))
608 static inline slot_idx_t
KR_SLOT_INDEX(const struct __kern_channel_ring * kr,const struct __slot_desc * slot)609 KR_SLOT_INDEX(const struct __kern_channel_ring *kr,
610 const struct __slot_desc *slot)
611 {
612 ASSERT(slot >= kr->ckr_ksds && slot <= kr->ckr_ksds_last);
613 return (slot_idx_t)(slot - kr->ckr_ksds);
614 }
615
616 /* Helper macros for slot descriptor, decoupled for KSD/USD. */
617
618 #define KSD_VALID_METADATA(_ksd) \
619 ((_ksd)->sd_md != NULL)
620
621 #define KSD_INIT(_ksd) do { \
622 (_ksd)->sd_md = NULL; \
623 } while (0)
624
625 #define KSD_ATTACH_METADATA(_ksd, _md_addr) do { \
626 ASSERT((_ksd) != NULL); \
627 ASSERT((_ksd)->sd_md == NULL); \
628 (_ksd)->sd_md = (_md_addr); \
629 } while (0)
630
631 #define KSD_DETACH_METADATA(_ksd) do { \
632 ASSERT((_ksd) != NULL); \
633 ASSERT((_ksd)->sd_md != NULL); \
634 (_ksd)->sd_md = NULL; \
635 } while (0)
636
637 #define KSD_RESET(_ksd) KSD_INIT(_ksd)
638
639 #define USD_INIT(_usd) do { \
640 (_usd)->sd_md_idx = OBJ_IDX_NONE; \
641 (_usd)->sd_flags = 0; \
642 (_usd)->sd_len = 0; \
643 } while (0)
644
645 #define USD_ATTACH_METADATA(_usd, _md_idx) do { \
646 ASSERT((_usd) != NULL); \
647 ASSERT((_usd)->sd_md_idx == OBJ_IDX_NONE); \
648 ASSERT(((_usd)->sd_flags & SD_IDX_VALID) == 0); \
649 (_usd)->sd_md_idx = (_md_idx); \
650 (_usd)->sd_flags |= SD_IDX_VALID; \
651 /* mask off non-user flags */ \
652 (_usd)->sd_flags &= SD_FLAGS_USER; \
653 } while (0);
654
655 #define USD_DETACH_METADATA(_usd) do { \
656 ASSERT((_usd) != NULL); \
657 (_usd)->sd_md_idx = OBJ_IDX_NONE; \
658 /* mask off non-user flags */ \
659 (_usd)->sd_flags &= SD_FLAGS_USER; \
660 (_usd)->sd_flags &= ~SD_IDX_VALID; \
661 } while (0)
662
663 #define USD_RESET(_usd) USD_INIT(_usd)
664
665 #define USD_SET_LENGTH(_usd, _md_len) do { \
666 ASSERT((_usd) != NULL); \
667 (_usd)->sd_len = _md_len; \
668 } while (0)
669
670 #define _USD_COPY(_src, _dst) do { \
671 _CASSERT(sizeof (struct __user_slot_desc) == 8); \
672 sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \
673 } while (0)
674
675 #define _USD_SWAP(_usd1, _usd2) do { \
676 struct __user_slot_desc _tusd \
677 __attribute((aligned(sizeof (uint64_t)))); \
678 _USD_COPY(_usd1, &_tusd); \
679 _USD_COPY(_usd2, _usd1); \
680 _USD_COPY(&_tusd, _usd2); \
681 } while (0)
682
683 #define _KSD_COPY(_src, _dst) do { \
684 _CASSERT(sizeof (struct __kern_slot_desc) == 8); \
685 sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \
686 } while (0)
687
688 #define _KSD_SWAP(_ksd1, _ksd2) do { \
689 struct __kern_slot_desc _tksd \
690 __attribute((aligned(sizeof (uint64_t)))); \
691 _KSD_COPY(_ksd1, &_tksd); \
692 _KSD_COPY(_ksd2, _ksd1); \
693 _KSD_COPY(&_tksd, _ksd2); \
694 } while (0)
695
/*
 * Swap the user and kernel slot descriptors of two slots, then repoint
 * each metadata's qum_ksd backpointer at the kernel descriptor that now
 * holds it, keeping metadata and slot descriptor mutually consistent.
 */
#define SD_SWAP(_ksd1, _usd1, _ksd2, _usd2) do { \
	_USD_SWAP(_usd1, _usd2); \
	_KSD_SWAP(_ksd1, _ksd2); \
	/* swap packet attachment */ \
	*(struct __kern_slot_desc **)(uintptr_t)&(_ksd1)->sd_qum->qum_ksd = \
	(_ksd1); \
	*(struct __kern_slot_desc **)(uintptr_t)&(_ksd2)->sd_qum->qum_ksd = \
	(_ksd2); \
} while (0)
705
/*
 * Extract the first buflet's buffer address, object address, data
 * offset, data length and data limit from a metadata handle.  For
 * NEXUS_META_TYPE_PACKET the packet's first buflet is used; for any
 * other metadata type the quantum's embedded buflet (qum_buf[0]) is.
 */
#define _MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim) do { \
	struct __kern_quantum *_q = SK_PTR_ADDR_KQUM(_md); \
	switch (METADATA_TYPE(_q)) { \
	case NEXUS_META_TYPE_PACKET: { \
	struct __kern_packet *_p = \
	(struct __kern_packet *)(void *)(_md); \
	struct __kern_buflet *_kbft; \
	PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \
	(_addr) = __DECONST(void *, _kbft->buf_addr); \
	(_objaddr) = _kbft->buf_objaddr; \
	(_doff) = _kbft->buf_doff; \
	(_dlen) = _kbft->buf_dlen; \
	(_dlim) = _kbft->buf_dlim; \
	break; \
	} \
	default: \
	(_addr) = __DECONST(void *, _q->qum_buf[0].buf_addr); \
	(_objaddr) = _q->qum_buf[0].buf_objaddr; \
	(_doff) = _q->qum_buf[0].buf_doff; \
	(_dlen) = _q->qum_buf[0].buf_dlen; \
	(_dlim) = _q->qum_buf[0].buf_dlim; \
	break; \
	} \
	ASSERT((_addr) != NULL); \
	ASSERT((_objaddr) != NULL); \
} while (0)
732
733 #define _MD_BUFLET_ADDR_PKT(_md, _addr) do { \
734 ASSERT(METADATA_TYPE(SK_PTR_ADDR_KQUM(_md)) == \
735 NEXUS_META_TYPE_PACKET); \
736 struct __kern_packet *_p = (struct __kern_packet *)(void *)(_md); \
737 struct __kern_buflet *_kbft; \
738 PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \
739 (_addr) = __DECONST(void *, _kbft->buf_addr); \
740 ASSERT((_addr) != NULL); \
741 } while (0)
742
743
744 /*
745 * Return the data offset adjusted virtual address of a buffer associated
746 * with the metadata; for metadata with multiple buflets, this is the
747 * first buffer's address.
748 */
749 #define MD_BUFLET_ADDR(_md, _val) do { \
750 void *_addr, *_objaddr; \
751 uint16_t _doff, _dlen, _dlim; \
752 _MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
753 /* skip past buflet data offset */ \
754 (_val) = (void *)((uint8_t *)_addr + _doff); \
755 } while (0)
756
757 /*
758 * Return the absolute virtual address of a buffer associated with the
759 * metadata; for metadata with multiple buflets, this is the first
760 * buffer's address.
761 */
762 #define MD_BUFLET_ADDR_ABS(_md, _val) do { \
763 void *_addr, *_objaddr; \
764 uint16_t _doff, _dlen, _dlim; \
765 _MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
766 (_val) = (void *)_addr; \
767 } while (0)
768
769 /* similar to MD_BUFLET_ADDR_ABS() but optimized only for packets */
770 #define MD_BUFLET_ADDR_ABS_PKT(_md, _val) do { \
771 void *_addr; \
772 _MD_BUFLET_ADDR_PKT(_md, _addr); \
773 (_val) = (void *)_addr; \
774 } while (0)
775
776
777 #define MD_BUFLET_ADDR_ABS_DLEN(_md, _val, _dlen, _dlim, _doff) do { \
778 void *_addr, *_objaddr; \
779 _MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
780 (_val) = (void *)_addr; \
781 } while (0)
782
783
784 /*
785 * Return the buffer's object address associated with the metadata; for
786 * metadata with multiple buflets, this is the first buffer's object address.
787 */
788 #define MD_BUFLET_OBJADDR(_md, _val) do { \
789 void *_addr, *_objaddr; \
790 uint16_t _doff, _dlen, _dlim; \
791 _MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
792 (_val) = (void *)_objaddr; \
793 } while (0)
794
795 /*
796 * Return the data offset adjusted virtual address of a buffer associated
797 * with the metadata; for metadata with multiple buflets, this is the
798 * first buffer's address and data length.
799 */
800 #define MD_BUFLET_ADDR_DLEN(_md, _val, _dlen) do { \
801 void *_addr, *_objaddr; \
802 uint16_t _doff, _dlim; \
803 _MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
804 /* skip past buflet data offset */ \
805 (_val) = (void *)((uint8_t *)_addr + _doff); \
806 } while (0)
807
808 /* kr_space: return available space for enqueue into kring */
809 __attribute__((always_inline))
810 static inline uint32_t
kr_available_slots(struct __kern_channel_ring * kr)811 kr_available_slots(struct __kern_channel_ring *kr)
812 {
813 int busy;
814 uint32_t space;
815
816 busy = (int)(kr->ckr_klease - kr->ckr_khead);
817 if (busy < 0) {
818 busy += kr->ckr_num_slots;
819 }
820 space = kr->ckr_lim - (uint32_t)busy;
821
822 return space;
823 }
824
825 /* kr_space: return available space for enqueue into Rx kring */
826 __attribute__((always_inline))
827 static inline uint32_t
kr_available_slots_rxring(struct __kern_channel_ring * rxkring)828 kr_available_slots_rxring(struct __kern_channel_ring *rxkring)
829 {
830 int busy;
831 uint32_t space;
832
833 /* # of rx busy (unclaimed) slots */
834 busy = (int)(rxkring->ckr_ktail - rxkring->ckr_khead);
835 if (busy < 0) {
836 busy += rxkring->ckr_num_slots;
837 }
838
839 /* # of rx avail free slots (subtract busy from max) */
840 space = rxkring->ckr_lim - (uint32_t)busy;
841 return space;
842 }
843
844 /*
845 * kr_reserve_slots: reserve n slots from kr in range [start, end).
846 * return ticket for later publish those reserved correspondingly.
847 */
848 __attribute__((always_inline))
849 static inline uint32_t
kr_reserve_slots_locked(struct __kern_channel_ring * kr,uint32_t n,uint32_t * start,uint32_t * end,uint32_t * ticket)850 kr_reserve_slots_locked(struct __kern_channel_ring *kr, uint32_t n,
851 uint32_t *start, uint32_t *end, uint32_t *ticket)
852 {
853 uint32_t n_available;
854 const uint32_t lim = kr->ckr_lim;
855
856 n_available = kr_available_slots(kr);
857 if (n > n_available) {
858 n = n_available;
859 }
860
861 *ticket = kr->ckr_lease_idx;
862 kr->ckr_leases[*ticket] = CKR_NOSLOT;
863 kr->ckr_lease_idx = SLOT_NEXT(*ticket, lim);
864
865 *start = kr->ckr_klease;
866 *end = *start + n;
867 if (*end > lim) {
868 *end -= lim + 1;
869 }
870 kr->ckr_klease = *end;
871
872 ASSERT(kr->ckr_khead < kr->ckr_num_slots);
873 ASSERT(kr->ckr_ktail < kr->ckr_num_slots);
874 ASSERT(kr->ckr_klease < kr->ckr_num_slots);
875 ASSERT(kr->ckr_lease_idx < kr->ckr_num_slots);
876
877 return n;
878 }
879
880 /*
881 * kr_publish_slots: publish slots previously reserved in [start, end).
882 * Should only be called after slots and associated packets have been deep
883 * enqueued and made available.
884 */
885 __attribute__((always_inline))
886 static inline void
kr_publish_slots(struct __kern_channel_ring * kr,struct proc * p,uint32_t start,uint32_t end,uint32_t ticket)887 kr_publish_slots(struct __kern_channel_ring *kr, struct proc *p,
888 uint32_t start, uint32_t end, uint32_t ticket)
889 {
890 uint32_t *l = kr->ckr_leases;
891 uint32_t update_pos;
892 boolean_t new_bufs = FALSE;
893
894 ASSERT(start < kr->ckr_num_slots);
895 ASSERT(end < kr->ckr_num_slots);
896 ASSERT(ticket < kr->ckr_num_slots);
897
898 l[ticket] = end;
899
900 KR_LOCK(kr);
901 update_pos = kr->ckr_ktail;
902 if (__probable(start == update_pos)) {
903 /*
904 * All slots before start have been reported,
905 * so scan subsequent leases to see if other ranges
906 * have been completed, and to a selwakeup or txsync.
907 */
908 while (ticket != kr->ckr_lease_idx &&
909 l[ticket] != CKR_NOSLOT) {
910 end = l[ticket];
911 l[ticket] = CKR_NOSLOT;
912 ticket = SLOT_NEXT(ticket, kr->ckr_lim);
913 }
914 /*
915 * end is the new 'write' position. end != start
916 * means there are new buffers to report
917 */
918 if (__probable(end != start)) {
919 kr->ckr_ktail = end;
920 new_bufs = TRUE;
921 }
922 }
923 KR_UNLOCK(kr);
924
925 if (new_bufs) {
926 kr->ckr_na_notify(kr, p, 0);
927 }
928 }
929
930 extern kern_allocation_name_t skmem_tag_ch_key;
931
932 #if (DEVELOPMENT || DEBUG)
933 SYSCTL_DECL(_kern_skywalk_channel);
#endif /* DEVELOPMENT || DEBUG */
935
936 __BEGIN_DECLS
937 extern int channel_init(void);
938 extern void channel_fini(void);
939
940 extern struct kern_channel *ch_open(struct ch_init *, struct proc *,
941 int, int *);
942 extern struct kern_channel *ch_open_special(struct kern_nexus *,
943 struct chreq *, boolean_t, int *);
944 extern void ch_close(struct kern_channel *, boolean_t);
945 extern void ch_close_special(struct kern_channel *);
946 extern int ch_kqfilter(struct kern_channel *, struct knote *,
947 struct kevent_qos_s *kev);
948 extern boolean_t ch_is_multiplex(struct kern_channel *, enum txrx);
949 extern int ch_select(struct kern_channel *, int, void *, struct proc *);
950 extern int ch_get_opt(struct kern_channel *, struct sockopt *);
951 extern int ch_set_opt(struct kern_channel *, struct sockopt *);
952 extern void ch_deactivate(struct kern_channel *);
953 extern void ch_retain(struct kern_channel *);
954 extern void ch_retain_locked(struct kern_channel *);
955 extern int ch_release(struct kern_channel *);
956 extern int ch_release_locked(struct kern_channel *);
957 extern void ch_dtor(void *);
958
959 extern void csi_init(struct ch_selinfo *, boolean_t, uint64_t);
960 extern void csi_destroy(struct ch_selinfo *);
961 extern void csi_selrecord_one(struct __kern_channel_ring *, struct proc *,
962 void *);
963 extern void csi_selrecord_all(struct nexus_adapter *, enum txrx, struct proc *,
964 void *);
965 extern void csi_selwakeup_one(struct __kern_channel_ring *, boolean_t,
966 boolean_t, boolean_t, uint32_t);
967 extern void csi_selwakeup_all(struct nexus_adapter *, enum txrx, boolean_t,
968 boolean_t, boolean_t, uint32_t);
969
970 extern void kr_init_to_mhints(struct __kern_channel_ring *, uint32_t);
971 extern int kr_enter(struct __kern_channel_ring *, boolean_t);
972 extern void kr_exit(struct __kern_channel_ring *);
973 extern void kr_start(struct __kern_channel_ring *);
974 extern void kr_stop(struct __kern_channel_ring *kr, uint32_t state);
975 extern void kr_update_stats(struct __kern_channel_ring *kring,
976 uint32_t slot_count, uint32_t byte_count);
977 extern boolean_t kr_txempty(struct __kern_channel_ring *kring);
978 extern uint32_t kr_reclaim(struct __kern_channel_ring *kr);
979
980 extern slot_idx_t kr_txsync_prologue(struct kern_channel *,
981 struct __kern_channel_ring *, struct proc *);
982 extern int kr_txprologue(struct kern_channel *,
983 struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
984 struct proc *);
985 extern int kr_txprologue_upp(struct kern_channel *,
986 struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
987 struct proc *);
988
989 extern void kr_txsync_finalize(struct kern_channel *,
990 struct __kern_channel_ring *, struct proc *);
991 extern void kr_txfinalize(struct kern_channel *,
992 struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
993 extern void kr_txfinalize_upp(struct kern_channel *,
994 struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
995
996 extern slot_idx_t kr_rxsync_prologue(struct kern_channel *ch,
997 struct __kern_channel_ring *kring, struct proc *p);
998 extern int kr_rxprologue(struct kern_channel *,
999 struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
1000 struct proc *);
1001 extern int kr_rxprologue_nodetach(struct kern_channel *,
1002 struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
1003 struct proc *);
1004 extern int kr_rxprologue_upp(struct kern_channel *,
1005 struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
1006 struct proc *);
1007
1008 extern void kr_rxsync_finalize(struct kern_channel *ch,
1009 struct __kern_channel_ring *kring, struct proc *p);
1010 extern void kr_rxfinalize(struct kern_channel *,
1011 struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
1012 extern void kr_rxfinalize_upp(struct kern_channel *,
1013 struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
1014
1015 extern void kr_txkring_reclaim_and_refill(struct __kern_channel_ring *kring,
1016 slot_idx_t index);
1017 extern slot_idx_t kr_alloc_sync_prologue(struct __kern_channel_ring *kring,
1018 struct proc *p);
1019 extern slot_idx_t kr_free_sync_prologue(struct __kern_channel_ring *kring,
1020 struct proc *p);
1021 extern void kr_alloc_sync_finalize(struct __kern_channel_ring *kring,
1022 struct proc *p);
1023 extern void kr_free_sync_finalize(struct __kern_channel_ring *kring,
1024 struct proc *p);
1025 extern int kr_internalize_metadata(struct kern_channel *,
1026 struct __kern_channel_ring *, const uint32_t, struct __kern_quantum *,
1027 struct proc *);
1028 extern void kr_externalize_metadata(struct __kern_channel_ring *,
1029 const uint32_t, struct __kern_quantum *, struct proc *);
1030 extern slot_idx_t kr_event_sync_prologue(struct __kern_channel_ring *kring,
1031 struct proc *p);
1032 extern void kr_event_sync_finalize(struct kern_channel *ch,
1033 struct __kern_channel_ring *kring, struct proc *p);
1034
1035 #if SK_LOG
1036 extern void kr_log_bad_ring(struct __kern_channel_ring *);
1037 #else
1038 #define kr_log_bad_ring(_kr) do { ((void)0); } while (0)
1039 #endif /* SK_LOG */
1040 __END_DECLS
1041 #endif /* BSD_KERNEL_PRIVATE */
1042 #endif /* !_SKYWALK_CHANNEL_CHANNELVAR_H_ */
1043