xref: /xnu-11215.1.10/bsd/skywalk/channel/channel_ring.c (revision 8d741a5de7ff4191bf97d57b9f54c2f6d4a15585)
1 /*
2  * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <kern/sched_prim.h>
31 #include <sys/sdt.h>
32 
/* Forward declarations of file-local helpers. */
33 static void kr_update_user_stats(struct __kern_channel_ring *,
34     uint32_t, uint32_t);
35 static void kr_externalize_metadata_internal(struct __kern_channel_ring *,
36     const uint32_t, struct __kern_quantum *, struct proc *);
37 
/*
 * KR_TRANSFER_DECAY is the default per-ring decay shift that
 * kr_init_to_mhints() stores into ckr_transfer_decay.  kr_transfer_decay
 * is a global override: the stats routines use it in place of the
 * per-ring value whenever it is non-zero.
 */
38 #define KR_TRANSFER_DECAY       2       /* ilog2 of EWMA decay rate (4) */
39 static uint32_t kr_transfer_decay = 0;
40 
/*
 * Minimum net_uptime() delta that must elapse before the stats routines
 * fold their accumulated counters into the derived averages.
 */
41 #define KR_ACCUMULATE_INTERVAL  2 /* 2 seconds */
42 static uint32_t kr_accumulate_interval = KR_ACCUMULATE_INTERVAL;
43 
/* Stats collection defaults to on for DEVELOPMENT/DEBUG builds only. */
44 #if (DEVELOPMENT || DEBUG)
45 #define KR_STAT_ENABLE          1
46 #else /* !(DEVELOPMENT || DEBUG) */
47 #define KR_STAT_ENABLE          0
48 #endif /* !(DEVELOPMENT || DEBUG) */
49 /* Enable/Disable ring stats collection */
50 uint32_t kr_stat_enable = KR_STAT_ENABLE;
51 
/* The following tunables are exposed only on DEVELOPMENT/DEBUG builds. */
52 #if (DEVELOPMENT || DEBUG)
53 SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_transfer_decay,
54     CTLFLAG_RW | CTLFLAG_LOCKED, &kr_transfer_decay,
55     0, "ilog2 of EWMA decay rate of ring transfers");
56 
57 SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_accumulate_interval,
58     CTLFLAG_RW | CTLFLAG_LOCKED, &kr_accumulate_interval,
59     KR_ACCUMULATE_INTERVAL, "accumulation interval for ring stats");
60 
/*
 * When non-zero, kr_rxprologue_nodetach() skips its panic on an
 * inconsistent ready-bytes count.
 */
61 uint32_t kr_disable_panic_on_sync_err = 0;
62 SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_panic_on_sync_err,
63     CTLFLAG_RW | CTLFLAG_LOCKED, &kr_disable_panic_on_sync_err,
64     0, "disable panic on sync error");
65 #endif /* (DEVELOPMENT || DEBUG) */
66 
/* Registered on every build variant (outside the DEVELOPMENT/DEBUG guard). */
67 SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_enable,
68     CTLFLAG_RW | CTLFLAG_LOCKED, &kr_stat_enable,
69     0, "enable/disable stats collection for ring");
70 
/*
 * Fold the sample (new) into the moving average held in the lvalue (old),
 * in place.  A zero average is treated as "no history yet" and is simply
 * replaced by the sample; otherwise the result is
 *	((old << decay) - old + new) >> decay
 * computed in 64-bit unsigned arithmetic.
 */
#define KR_EWMA(old, new, decay) do {                                   \
	const u_int64_t _prev = (old);                                  \
	(old) = __probable(_prev > 0) ?                                 \
	    ((((_prev << (decay)) - _prev) + (new)) >> (decay)) :       \
	    (new);                                                      \
} while (0)

/*
 * Data limit for a buflet: the pool's large buffer size when the buflet
 * is flagged as having a large buffer, the pool's default size otherwise.
 */
#define _BUF_DLIM(_buf, _pp)    (BUFLET_HAS_LARGE_BUF(_buf) ?           \
	PP_BUF_SIZE_LARGE(_pp) : PP_BUF_SIZE_DEF(_pp))
82 
83 void
kr_init_to_mhints(struct __kern_channel_ring * kring,uint32_t nslots)84 kr_init_to_mhints(struct __kern_channel_ring *kring, uint32_t nslots)
85 {
86 	uint32_t tail;
87 
88 	tail = nslots - 1;
89 
90 	kring->ckr_transfer_decay = KR_TRANSFER_DECAY;
91 	kring->ckr_num_slots = nslots;
92 	*(slot_idx_t *)(uintptr_t)&kring->ckr_lim = (nslots - 1);
93 	kring->ckr_rhead = kring->ckr_khead = 0;
94 	/* IMPORTANT: Always keep one slot empty */
95 	kring->ckr_rtail = kring->ckr_ktail =
96 	    ((kring->ckr_tx == NR_TX) || (kring->ckr_tx == NR_F) ? tail : 0);
97 }
98 
99 /*
100  * Try to obtain exclusive right to issue the *sync() or state change
101  * operations on the ring.  The right is obtained and must be later
102  * relinquished via kr_exit() if and only if kr_enter() returns 0.
103  *
104  * In all cases the caller will typically skip the ring, possibly collecting
105  * errors along the way.
106  *
107  * If the calling context does not allow sleeping, the caller must pass
108  * FALSE in can_sleep; EBUSY may be returned if the right is held by
109  * another thread.  Otherwise, the caller may block until the right is
110  * released by the previous holder.
111  */
112 int
kr_enter(struct __kern_channel_ring * kr,boolean_t can_sleep)113 kr_enter(struct __kern_channel_ring *kr, boolean_t can_sleep)
114 {
115 	lck_spin_lock(&kr->ckr_slock);
116 	if (kr->ckr_owner == current_thread()) {
117 		ASSERT(kr->ckr_busy != 0);
118 		kr->ckr_busy++;
119 		goto done;
120 	}
121 	if (!can_sleep) {
122 		if (kr->ckr_busy != 0) {
123 			lck_spin_unlock(&kr->ckr_slock);
124 			return EBUSY;
125 		}
126 	} else {
127 		while (kr->ckr_busy != 0) {
128 			kr->ckr_want++;
129 			(void) assert_wait(&kr->ckr_busy, THREAD_UNINT);
130 			lck_spin_unlock(&kr->ckr_slock);
131 			(void) thread_block(THREAD_CONTINUE_NULL);
132 			SK_DF(SK_VERB_LOCKS, "waited for kr \"%s\" "
133 			    "(0x%llx) busy=%u", kr->ckr_name,
134 			    SK_KVA(kr), kr->ckr_busy);
135 			lck_spin_lock(&kr->ckr_slock);
136 		}
137 	}
138 	LCK_SPIN_ASSERT(&kr->ckr_slock, LCK_ASSERT_OWNED);
139 	ASSERT(kr->ckr_busy == 0);
140 	kr->ckr_busy++;
141 	kr->ckr_owner = current_thread();
142 done:
143 	lck_spin_unlock(&kr->ckr_slock);
144 
145 	SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right acquired",
146 	    kr->ckr_name, SK_KVA(kr));
147 
148 	return 0;
149 }
150 
151 void
kr_exit(struct __kern_channel_ring * kr)152 kr_exit(struct __kern_channel_ring *kr)
153 {
154 	uint32_t want = 0;
155 
156 	lck_spin_lock(&kr->ckr_slock);
157 	ASSERT(kr->ckr_busy != 0);
158 	ASSERT(kr->ckr_owner == current_thread());
159 	if (--kr->ckr_busy == 0) {
160 		kr->ckr_owner = NULL;
161 
162 		/*
163 		 * we're done with the kring;
164 		 * notify anyone that has lost the race
165 		 */
166 		if ((want = kr->ckr_want) != 0) {
167 			kr->ckr_want = 0;
168 			wakeup((void *)&kr->ckr_busy);
169 			lck_spin_unlock(&kr->ckr_slock);
170 		} else {
171 			lck_spin_unlock(&kr->ckr_slock);
172 		}
173 	} else {
174 		lck_spin_unlock(&kr->ckr_slock);
175 	}
176 
177 	SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right released (%u waiters)",
178 	    kr->ckr_name, SK_KVA(kr), want);
179 }
180 
181 
182 void
kr_start(struct __kern_channel_ring * kr)183 kr_start(struct __kern_channel_ring *kr)
184 {
185 	lck_spin_lock(&kr->ckr_slock);
186 	ASSERT(kr->ckr_busy != 0);
187 	ASSERT(kr->ckr_state == KR_STOPPED || kr->ckr_state == KR_LOCKED);
188 	/* now clear the state */
189 	kr->ckr_state = KR_READY;
190 	lck_spin_unlock(&kr->ckr_slock);
191 
192 	kr_exit(kr);
193 
194 	SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) is started",
195 	    kr->ckr_name, SK_KVA(kr));
196 }
197 
198 /*
199  * Put the kring in the 'stopped' state: either KR_STOPPED or KR_LOCKED.
200  * Also marks the ring as busy, which would require either kr_start() at a
201  * later point.
202  */
203 void
kr_stop(struct __kern_channel_ring * kr,uint32_t state)204 kr_stop(struct __kern_channel_ring *kr, uint32_t state)
205 {
206 	uint32_t s;
207 
208 	ASSERT(state == KR_STOPPED || state == KR_LOCKED);
209 
210 	s = kr_enter(kr, TRUE);
211 	ASSERT(s == 0);
212 
213 	lck_spin_lock(&kr->ckr_slock);
214 	ASSERT(kr->ckr_busy != 0);
215 	/* now set the state */
216 	kr->ckr_state = state;
217 	lck_spin_unlock(&kr->ckr_slock);
218 
219 	SK_DF(SK_VERB_LOCKS,
220 	    "kr \"%s\" (0x%llx) krflags 0x%b is now stopped s=%u",
221 	    kr->ckr_name, SK_KVA(kr), kr->ckr_flags, CKRF_BITS, state);
222 }
223 
224 static void
kr_update_user_stats(struct __kern_channel_ring * kring,uint32_t slot_count,uint32_t byte_count)225 kr_update_user_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
226     uint32_t byte_count)
227 {
228 	uint64_t now;
229 	uint32_t transfer_decay = (kr_transfer_decay != 0) ?
230 	    kr_transfer_decay : kring->ckr_transfer_decay;
231 	channel_ring_user_stats_t stats = &kring->ckr_usr_stats;
232 
233 	now = net_uptime();
234 	kring->ckr_sync_time = now;
235 
236 	if (kr_stat_enable == 0) {
237 		return;
238 	}
239 
240 	stats->crsu_number_of_syncs++;
241 	stats->crsu_total_bytes_transferred += byte_count;
242 	stats->crsu_total_slots_transferred += slot_count;
243 
244 	if (slot_count > stats->crsu_max_slots_transferred) {
245 		stats->crsu_max_slots_transferred = slot_count;
246 	}
247 
248 	if (stats->crsu_min_slots_transferred == 0 ||
249 	    slot_count < stats->crsu_min_slots_transferred) {
250 		stats->crsu_min_slots_transferred = slot_count;
251 	}
252 
253 	if (__probable(kring->ckr_user_accumulate_start != 0)) {
254 		if ((now - kring->ckr_user_accumulate_start) >=
255 		    kr_accumulate_interval) {
256 			uint64_t        bps;
257 			uint64_t        sps;
258 			uint64_t        sps_ma;
259 
260 			/* bytes per sync */
261 			bps = kring->ckr_user_accumulated_bytes /
262 			    kring->ckr_user_accumulated_syncs;
263 			KR_EWMA(stats->crsu_bytes_per_sync_ma,
264 			    bps, transfer_decay);
265 			stats->crsu_bytes_per_sync = bps;
266 
267 			/* slots per sync */
268 			sps = kring->ckr_user_accumulated_slots /
269 			    kring->ckr_user_accumulated_syncs;
270 			sps_ma = stats->crsu_slots_per_sync_ma;
271 			KR_EWMA(sps_ma, sps, transfer_decay);
272 			stats->crsu_slots_per_sync_ma = (uint32_t)sps_ma;
273 			stats->crsu_slots_per_sync = (uint32_t)sps;
274 
275 			/* start over */
276 			kring->ckr_user_accumulate_start = now;
277 			kring->ckr_user_accumulated_bytes = 0;
278 			kring->ckr_user_accumulated_slots = 0;
279 			kring->ckr_user_accumulated_syncs = 0;
280 
281 			stats->crsu_min_slots_transferred = 0;
282 			stats->crsu_max_slots_transferred = 0;
283 		}
284 	} else {
285 		kring->ckr_user_accumulate_start = now;
286 	}
287 
288 	kring->ckr_user_accumulated_bytes += byte_count;
289 	kring->ckr_user_accumulated_slots += slot_count;
290 	kring->ckr_user_accumulated_syncs++;
291 }
292 
293 /* caller to make sure thread safety */
294 void
kr_update_stats(struct __kern_channel_ring * kring,uint32_t slot_count,uint32_t byte_count)295 kr_update_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
296     uint32_t byte_count)
297 {
298 	uint64_t now;
299 	uint64_t diff_secs;
300 	channel_ring_stats_t stats = &kring->ckr_stats;
301 	uint32_t transfer_decay = (kr_transfer_decay != 0) ?
302 	    kr_transfer_decay : kring->ckr_transfer_decay;
303 
304 	if (kr_stat_enable == 0) {
305 		return;
306 	}
307 
308 	if (__improbable(slot_count == 0)) {
309 		return;
310 	}
311 
312 	stats->crs_number_of_transfers++;
313 	stats->crs_total_bytes_transferred += byte_count;
314 	stats->crs_total_slots_transferred += slot_count;
315 	if (slot_count > stats->crs_max_slots_transferred) {
316 		stats->crs_max_slots_transferred = slot_count;
317 	}
318 	if (stats->crs_min_slots_transferred == 0 ||
319 	    slot_count < stats->crs_min_slots_transferred) {
320 		stats->crs_min_slots_transferred = slot_count;
321 	}
322 
323 	now = net_uptime();
324 	if (__probable(kring->ckr_accumulate_start != 0)) {
325 		diff_secs = now - kring->ckr_accumulate_start;
326 		if (diff_secs >= kr_accumulate_interval) {
327 			uint64_t        bps;
328 			uint64_t        sps;
329 			uint64_t        sps_ma;
330 
331 			/* bytes per second */
332 			bps = kring->ckr_accumulated_bytes / diff_secs;
333 			KR_EWMA(stats->crs_bytes_per_second_ma,
334 			    bps, transfer_decay);
335 			stats->crs_bytes_per_second = bps;
336 
337 			/* slots per second */
338 			sps = kring->ckr_accumulated_slots / diff_secs;
339 			sps_ma = stats->crs_slots_per_second_ma;
340 			KR_EWMA(sps_ma, sps, transfer_decay);
341 			stats->crs_slots_per_second_ma = (uint32_t)sps_ma;
342 			stats->crs_slots_per_second = (uint32_t)sps;
343 
344 			/* start over */
345 			kring->ckr_accumulate_start = now;
346 			kring->ckr_accumulated_bytes = 0;
347 			kring->ckr_accumulated_slots = 0;
348 
349 			stats->crs_min_slots_transferred = 0;
350 			stats->crs_max_slots_transferred = 0;
351 		}
352 	} else {
353 		kring->ckr_accumulate_start = now;
354 	}
355 	kring->ckr_accumulated_bytes += byte_count;
356 	kring->ckr_accumulated_slots += slot_count;
357 }
358 
359 /* True if no space in the tx ring. only valid after kr_txsync_prologue */
360 boolean_t
kr_txempty(struct __kern_channel_ring * kring)361 kr_txempty(struct __kern_channel_ring *kring)
362 {
363 	return kring->ckr_rhead == kring->ckr_ktail;
364 }
365 
366 #if SK_LOG
367 /*
368  * Error logging routine called when txsync/rxsync detects an error.
369  * Expected to be called before killing the process with skywalk_kill_process()
370  *
371  * This routine is only called by the upper half of the kernel.
372  * It only reads khead (which is changed only by the upper half, too)
373  * and ktail (which may be changed by the lower half, but only on
374  * a tx ring and only to increase it, so any error will be recovered
375  * on the next call). For the above, we don't strictly need to call
376  * it under lock.
377  */
378 void
kr_log_bad_ring(struct __kern_channel_ring * kring)379 kr_log_bad_ring(struct __kern_channel_ring *kring)
380 {
381 	struct __user_channel_ring *ring = kring->ckr_ring;
382 	const slot_idx_t lim = kring->ckr_lim;
383 	slot_idx_t i;
384 	int errors = 0;
385 
386 	// XXX KASSERT nm_kr_tryget
387 	SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b", kring->ckr_name,
388 	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS);
389 	// XXX probably wrong to trust userspace
390 
391 	if (ring->ring_head > lim) {
392 		errors++;
393 	}
394 	if (ring->ring_tail > lim) {
395 		errors++;
396 	}
397 	for (i = 0; i <= lim; i++) {
398 		struct __kern_slot_desc *ksd = KR_KSD(kring, i);
399 		struct __kern_quantum *kqum = ksd->sd_qum;
400 		obj_idx_t idx;
401 		uint32_t len;
402 
403 		if (!KSD_VALID_METADATA(ksd)) {
404 			continue;
405 		}
406 
407 		idx = METADATA_IDX(kqum);
408 		len = kqum->qum_len;
409 		if (len > kring->ckr_max_pkt_len) {
410 			SK_RDERR(5, "bad len at slot %u idx %u len %u",
411 			    i, idx, len);
412 		}
413 	}
414 
415 	if (errors != 0) {
416 		SK_ERR("total %d errors", errors);
417 		SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b crash, "
418 		    "head %u -> %u tail %u -> %u", kring->ckr_name,
419 		    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, ring->ring_head,
420 		    kring->ckr_rhead, kring->ckr_khead,
421 		    ring->ring_tail, kring->ckr_ktail);
422 	}
423 }
424 #endif /* SK_LOG */
425 
426 uint32_t
kr_reclaim(struct __kern_channel_ring * kr)427 kr_reclaim(struct __kern_channel_ring *kr)
428 {
429 	int r = 0;
430 
431 	VERIFY(sk_is_sync_protected());
432 
433 	/*
434 	 * This is a no-op for TX ring, since the TX reclaim logic is only
435 	 * known to the nexus itself.  There, the nexus's TX sync code would
436 	 * figure out the number of slots that has been "transmitted", and
437 	 * advance the slot pointer accordingly.  This routine would then be
438 	 * called as a way to advise the system of such condition.
439 	 *
440 	 * For RX ring, this will reclaim user-released slots, and it is
441 	 * to be called by the provider's RX sync routine prior to its
442 	 * processing new slots (into the RX ring).
443 	 *
444 	 * It is therefore advised that this routine be called at the start
445 	 * of the RX sync callback, as well as at the end of the TX sync
446 	 * callback; the latter is useful in case we decide to implement
447 	 * more logic in future.
448 	 */
449 	if ((kr->ckr_tx == NR_RX) || (kr->ckr_tx == NR_EV)) {
450 		/* # of reclaimed slots */
451 		r = kr->ckr_rhead - kr->ckr_khead;
452 		if (r < 0) {
453 			r += kr->ckr_num_slots;
454 		}
455 
456 		kr->ckr_khead = kr->ckr_rhead;
457 		/* ensure global visibility */
458 		os_atomic_thread_fence(seq_cst);
459 	}
460 
461 	return (slot_idx_t)r;
462 }
463 
464 /*
465  * Nexus-specific kr_txsync_prologue() callback.
466  */
467 int
kr_txprologue(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t head,uint32_t * byte_count,uint64_t * err_reason,struct proc * p)468 kr_txprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
469     const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
470     struct proc *p)
471 {
472 	struct kern_pbufpool *pp = kring->ckr_pp;
473 	const uint32_t maxfrags = pp->pp_max_frags;
474 	slot_idx_t slot_idx = kring->ckr_rhead;
475 
476 	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
477 
478 	while (slot_idx != head) {
479 		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
480 		struct __kern_quantum *kqum = ksd->sd_qum;
481 		int err;
482 
483 		if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
484 		    METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
485 			SK_ERR("qum index mismatch");
486 			*err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
487 			return -1;
488 		}
489 
490 		/* Internalize */
491 		err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
492 		if (__improbable(err != 0)) {
493 			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped "
494 			    "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
495 			    sk_proc_name_address(p), sk_proc_pid(p),
496 			    kring->ckr_name, SK_KVA(kring), slot_idx, err,
497 			    kring->ckr_khead, kring->ckr_ktail,
498 			    kring->ckr_rhead, kring->ckr_rtail,
499 			    kring->ckr_ring->ring_head,
500 			    kring->ckr_ring->ring_tail);
501 			*err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
502 			return -1;
503 		}
504 
505 		*byte_count += kqum->qum_len;
506 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
507 	}
508 
509 	return 0;
510 }
511 
512 /*
513  * Nexus-specific kr_txsync_prologue() callback - user packet pool variant.
514  */
515 int
kr_txprologue_upp(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t head,uint32_t * byte_count,uint64_t * err_reason,struct proc * p)516 kr_txprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
517     const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
518     struct proc *p)
519 {
520 	struct kern_pbufpool *pp = kring->ckr_pp;
521 	const uint32_t maxfrags = pp->pp_max_frags;
522 	slot_idx_t slot_idx = kring->ckr_rhead;
523 	struct __kern_quantum *kqum = NULL;
524 	bool free_pkt = false;
525 	int err = 0;
526 
527 	ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
528 
529 	PP_LOCK(pp);
530 	while (slot_idx != head) {
531 		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
532 		struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
533 
534 		/*
535 		 * The channel is operating in user packet pool mode;
536 		 * check if the packet is in the allocated list.
537 		 */
538 		kqum = pp_remove_upp_locked(pp, usd->sd_md_idx, &err);
539 		if (__improbable(err != 0)) {
540 			if (kqum != NULL) {
541 				SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u "
542 				    "kqum %p, bad buflet chain",
543 				    sk_proc_name_address(p), sk_proc_pid(p),
544 				    kring->ckr_name, SK_KVA(kring), slot_idx,
545 				    SK_KVA(kqum));
546 				*err_reason =
547 				    SKYWALK_KILL_REASON_BAD_BUFLET_CHAIN;
548 				goto done;
549 			}
550 
551 			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u "
552 			    " unallocated packet %u kh %u kt %u | "
553 			    "rh %u rt %u | h %u t %u",
554 			    sk_proc_name_address(p), sk_proc_pid(p),
555 			    kring->ckr_name, SK_KVA(kring), slot_idx,
556 			    usd->sd_md_idx, kring->ckr_khead, kring->ckr_ktail,
557 			    kring->ckr_rhead, kring->ckr_rtail,
558 			    kring->ckr_ring->ring_head,
559 			    kring->ckr_ring->ring_tail);
560 			*err_reason = SKYWALK_KILL_REASON_UNALLOCATED_PKT;
561 			goto done;
562 		}
563 
564 		if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
565 		    METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
566 			SK_ERR("qum index mismatch");
567 			*err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
568 			err = ERANGE;
569 			free_pkt = true;
570 			goto done;
571 		}
572 
573 		/* Internalize */
574 		err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
575 		if (__improbable(err != 0)) {
576 			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped "
577 			    "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
578 			    sk_proc_name_address(p), sk_proc_pid(p),
579 			    kring->ckr_name, SK_KVA(kring), slot_idx, err,
580 			    kring->ckr_khead, kring->ckr_ktail,
581 			    kring->ckr_rhead, kring->ckr_rtail,
582 			    kring->ckr_ring->ring_head,
583 			    kring->ckr_ring->ring_tail);
584 			*err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
585 			free_pkt = true;
586 			goto done;
587 		}
588 
589 		/*
590 		 * Attach packet to slot, detach mapping from alloc ring slot.
591 		 */
592 		kqum->qum_ksd = NULL;
593 		USD_RESET(usd);
594 		KR_SLOT_ATTACH_METADATA(kring, ksd, kqum);
595 
596 		*byte_count += kqum->qum_len;
597 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
598 	}
599 
600 done:
601 	PP_UNLOCK(pp);
602 	if (__improbable(err != 0) && free_pkt) {
603 		ASSERT(kqum != NULL);
604 		kqum->qum_ksd = NULL;
605 		pp_free_packet(pp, (uint64_t)kqum);
606 	}
607 	return err;
608 }
609 
/*
 * If condition t holds: log the stringified condition, store reason in
 * the enclosing function's 'err_reason' variable and jump to its 'error'
 * label.  Both must exist at the point of use.
 */
#define NM_FAIL_ON(t, reason) if (__improbable(t)) { SK_ERR("fail " #t); \
	err_reason = reason; goto error; }
/*
 * Validate parameters in the TX/FREE ring/kring.
 *
 * ckr_rhead, ckr_rtail=ktail are stored from previous round.
 * khead is the next packet to send to the ring.
 *
 * We want
 *    khead <= *ckr_rhead <= head <= tail = *ckr_rtail <= ktail
 *
 * ckr_khead, ckr_rhead, ckr_rtail and ckr_ktail are reliable
 *
 * _kring and _ring are the kernel and user rings; _kh, _kt and _krt are
 * the caller's snapshots of khead, ktail and rtail; _rh is the caller's
 * snapshot of the user-supplied head.  ckr_rhead and the user's
 * ring_tail are read from the rings directly.  Every access goes through
 * the macro parameters, so the macro does not depend on what the caller
 * happens to name its ring variables.  Failures are reported via
 * NM_FAIL_ON, with the requirements that implies.
 */
#define _KR_TXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh, _krt) do {\
	slot_idx_t _n = (_kring)->ckr_num_slots;                        \
	/* kernel sanity checks */                                      \
	NM_FAIL_ON((_kh) >= _n || (_kring)->ckr_rhead >= _n ||          \
	    (_krt) >= _n || (_kt) >= _n,                                \
	    SKYWALK_KILL_REASON_BASIC_SANITY);                          \
	/* user basic sanity checks */                                  \
	NM_FAIL_ON((_rh) >= _n, SKYWALK_KILL_REASON_BASIC_SANITY);      \
	/* \
	 * user sanity checks. We only use 'cur', \
	 * A, B, ... are possible positions for cur: \
	 * \
	 *  0    A  cur   B  tail  C  n-1 \
	 *  0    D  tail  E  cur   F  n-1 \
	 * \
	 * B, F, D are valid. A, C, E are wrong \
	 */                                                             \
	if ((_krt) >= (_kring)->ckr_rhead) {                            \
	/* want ckr_rhead <= head <= ckr_rtail */               \
	        NM_FAIL_ON((_rh) < (_kring)->ckr_rhead || (_rh) > (_krt), \
	            SKYWALK_KILL_REASON_HEAD_OOB);                      \
	} else { /* here ckr_rtail < ckr_rhead */                       \
	/* we need head outside ckr_rtail .. ckr_rhead */       \
	        NM_FAIL_ON((_rh) > (_krt) && (_rh) < (_kring)->ckr_rhead, \
	            SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED);              \
	}                                                               \
	NM_FAIL_ON((_ring)->ring_tail != (_krt),                        \
	    SKYWALK_KILL_REASON_TAIL_MISMATCH);                         \
} while (0)
651 
652 /*
653  * Validate parameters in the ring/kring on entry for *_txsync().
654  * Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
655  * in case of error, in order to force a reinit.
656  */
657 slot_idx_t
kr_txsync_prologue(struct kern_channel * ch,struct __kern_channel_ring * kring,struct proc * p)658 kr_txsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
659     struct proc *p)
660 {
661 	struct __user_channel_ring *ring = kring->ckr_ring;
662 	slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
663 	slot_idx_t head;
664 	uint32_t byte_count = 0;
665 	uint64_t err_reason = 0;
666 	int slot_count;
667 
668 	VERIFY(sk_is_sync_protected());
669 	/* assert that this routine is only called for user facing rings */
670 	ASSERT(!KR_KERNEL_ONLY(kring));
671 	ASSERT(kring->ckr_usds != NULL);
672 
673 	/* read these once and use local copies */
674 	head = ring->ring_head;
675 	ckr_khead = kring->ckr_khead;
676 	ckr_ktail = kring->ckr_ktail;
677 	os_atomic_thread_fence(seq_cst);
678 	ckr_rtail = kring->ckr_rtail;
679 
680 	SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
681 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
682 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
683 	    kring->ckr_rhead, ckr_rtail,
684 	    ring->ring_head, ring->ring_tail);
685 
686 	_KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);
687 
688 	/* # of new tx slots */
689 	slot_count = head - kring->ckr_rhead;
690 	if (slot_count < 0) {
691 		slot_count += kring->ckr_num_slots;
692 	}
693 
694 	/*
695 	 * Invoke nexus-specific TX prologue callback, set in na_kr_create().
696 	 */
697 	if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
698 	    kring, head, &byte_count, &err_reason, p) != 0)) {
699 		goto error;
700 	}
701 
702 	/* update the user's view of slots & bytes transferred */
703 	kr_update_user_stats(kring, slot_count, byte_count);
704 
705 	/* update the kernel view of ring */
706 	kring->ckr_rhead = head;
707 
708 	/* save for kr_txsync_finalize(); only khead is needed */
709 	kring->ckr_khead_pre = ckr_khead;
710 
711 	return head;
712 
713 error:
714 	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
715 	    "rh %u rt %u | h %u t %u |", sk_proc_name_address(p),
716 	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
717 	    CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead,
718 	    ckr_rtail, head, ring->ring_tail);
719 
720 	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_TX_SYNC);
721 
722 	return kring->ckr_num_slots;
723 }
724 
725 /*
726  * Validate parameters in the ring/kring on entry for *_free_sync().
727  * Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
728  * in case of error, in order to force a reinit.
729  */
730 slot_idx_t
kr_free_sync_prologue(struct __kern_channel_ring * kring,struct proc * p)731 kr_free_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
732 {
733 	struct __user_channel_ring *ring = kring->ckr_ring;
734 	slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
735 	slot_idx_t head;
736 	uint64_t err_reason = 0;
737 
738 	VERIFY(sk_is_sync_protected());
739 	/* read these once and use local copies */
740 	head = ring->ring_head;
741 	ckr_khead = kring->ckr_khead;
742 	ckr_ktail = kring->ckr_ktail;
743 	os_atomic_thread_fence(seq_cst);
744 	ckr_rtail = kring->ckr_rtail;
745 
746 	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
747 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
748 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
749 	    kring->ckr_rhead, ckr_rtail, ring->ring_head, ring->ring_tail);
750 
751 	_KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);
752 
753 	/* update the kernel view of ring */
754 	kring->ckr_rhead = head;
755 	return head;
756 
757 error:
758 	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
759 	    "rh %u rt %u | h %u t %u |", sk_proc_name_address(p),
760 	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
761 	    CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead,
762 	    ckr_rtail, head, ring->ring_tail);
763 
764 	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_FREE_SYNC);
765 	return kring->ckr_num_slots;
766 }
767 
768 /*
769  * Nexus-specific kr_rxsync_prologue() callback.
770  */
771 int
kr_rxprologue(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t head,uint32_t * byte_count,uint64_t * err_reason,struct proc * p)772 kr_rxprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
773     const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
774     struct proc *p)
775 {
776 #pragma unused(ch, p)
777 	slot_idx_t slot_idx = kring->ckr_rhead;
778 	uint32_t nfree = 0;
779 
780 	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
781 
782 	/*
783 	 * Iterating through the slots just read by user-space;
784 	 * ckr_rhead -> ring_head
785 	 */
786 	while (slot_idx != head) {
787 		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
788 		struct __kern_quantum *kqum = ksd->sd_qum;
789 
790 		ASSERT(KSD_VALID_METADATA(ksd));
791 		/* # of new bytes transferred */
792 		*byte_count += kqum->qum_len;
793 
794 		/* detach and free the packet */
795 		(void) KR_SLOT_DETACH_METADATA(kring, ksd);
796 		ASSERT(nfree < kring->ckr_num_slots);
797 		kring->ckr_scratch[nfree++] = (uint64_t)kqum;
798 
799 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
800 	}
801 
802 	if (nfree > 0) {
803 		pp_free_packet_batch(kring->ckr_pp,
804 		    &kring->ckr_scratch[0], nfree);
805 	}
806 
807 	/*
808 	 * Update userspace channel statistics of # readable bytes
809 	 * subtract byte counts from slots just given back to the kernel.
810 	 */
811 	if (kring->ckr_ready_bytes < *byte_count) {
812 		SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
813 		    "(%u < %u)  kh %u kt %u | rh %u rt %u | h %u t %u",
814 		    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
815 		    SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
816 		    kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
817 		    kring->ckr_rtail, kring->ckr_ring->ring_head,
818 		    kring->ckr_ring->ring_tail);
819 		*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
820 		return -1;
821 	}
822 	kring->ckr_ready_bytes -= *byte_count;
823 
824 	return 0;
825 }
826 
827 /*
828  * Nexus-specific kr_rxsync_prologue() callback - no detach variant.
829  */
830 int
kr_rxprologue_nodetach(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t head,uint32_t * byte_count,uint64_t * err_reason,struct proc * p)831 kr_rxprologue_nodetach(struct kern_channel *ch,
832     struct __kern_channel_ring *kring, const slot_idx_t head,
833     uint32_t *byte_count, uint64_t *err_reason, struct proc *p)
834 {
835 #pragma unused(ch, p)
836 	slot_idx_t slot_idx = kring->ckr_rhead;
837 
838 	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
839 
840 	/*
841 	 * Iterating through the slots just read by user-space;
842 	 * ckr_rhead -> ring_head
843 	 */
844 	while (slot_idx != head) {
845 		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
846 		struct __kern_quantum *kqum = ksd->sd_qum;
847 
848 		ASSERT(KSD_VALID_METADATA(ksd));
849 		/* # of new bytes transferred */
850 		*byte_count += kqum->qum_len;
851 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
852 	}
853 
854 	/*
855 	 * Update userspace channel statistics of # readable bytes
856 	 * subtract byte counts from slots just given back to the kernel.
857 	 */
858 	if (kring->ckr_ready_bytes < *byte_count) {
859 		SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
860 		    "(%u < %u)  kh %u kt %u | rh %u rt %u | h %u t %u",
861 		    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
862 		    SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
863 		    kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
864 		    kring->ckr_rtail, kring->ckr_ring->ring_head,
865 		    kring->ckr_ring->ring_tail);
866 		*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
867 #if (DEVELOPMENT || DEBUG)
868 		if (kr_disable_panic_on_sync_err == 0) {
869 			panic("kr(0x%llx), inconsistent, head %u, ready %llu, "
870 			    "cnt %u", SK_KVA(kring), head,
871 			    kring->ckr_ready_bytes, *byte_count);
872 			/* NOTREACHED */
873 			__builtin_unreachable();
874 		}
875 #else /* (DEVELOPMENT || DEBUG) */
876 		return -1;
877 #endif /* !(DEVELOPMENT || DEBUG) */
878 	}
879 	kring->ckr_ready_bytes -= *byte_count;
880 
881 	return 0;
882 }
883 
884 /*
885  * Nexus-specific kr_rxsync_prologue() callback - user packet pool variant.
886  */
887 int
kr_rxprologue_upp(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t head,uint32_t * byte_count,uint64_t * err_reason,struct proc * p)888 kr_rxprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
889     const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
890     struct proc *p)
891 {
892 #pragma unused(ch, p)
893 	slot_idx_t slot_idx = kring->ckr_rhead;
894 
895 	ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
896 
897 	/*
898 	 * Iterating through the slots just read by user-space;
899 	 * ckr_rhead -> ring_head
900 	 */
901 	while (slot_idx != head) {
902 		struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
903 
904 		/*
905 		 * This is a user facing ring opting in for the user packet
906 		 * pool mode, so ensure that the user has detached packet
907 		 * from slot.
908 		 */
909 		ASSERT(!KSD_VALID_METADATA(KR_KSD(kring, slot_idx)));
910 		if (SD_VALID_METADATA(usd)) {
911 			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not "
912 			    "detached md %u kh %u kt %u | rh %u rt %u |"
913 			    " h %u t %u", sk_proc_name_address(p),
914 			    sk_proc_pid(p), kring->ckr_name,
915 			    SK_KVA(kring), slot_idx, usd->sd_md_idx,
916 			    kring->ckr_khead, kring->ckr_ktail,
917 			    kring->ckr_rhead, kring->ckr_rtail,
918 			    kring->ckr_ring->ring_head,
919 			    kring->ckr_ring->ring_tail);
920 			*err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
921 			return -1;
922 		}
923 		*byte_count += usd->sd_len;
924 
925 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
926 	}
927 
928 	/*
929 	 * update userspace channel statistics of # readable bytes
930 	 * subtract byte counts from slots just given back to the kernel
931 	 */
932 	if (kring->ckr_ready_bytes < *byte_count) {
933 		SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
934 		    "(%u < %u)  kh %u kt %u | rh %u rt %u | h %u t %u",
935 		    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
936 		    SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
937 		    kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
938 		    kring->ckr_rtail, kring->ckr_ring->ring_head,
939 		    kring->ckr_ring->ring_tail);
940 		*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
941 		return -1;
942 	}
943 	kring->ckr_ready_bytes -= *byte_count;
944 
945 	return 0;
946 }
947 
/*
 * Validate parameters in the RX/ALLOC/EVENT ring/kring.
 * For a valid configuration,
 * khead <= head <= tail <= ktail
 *
 * We only consider head.
 * khead and ktail are reliable.
 *
 * _kring/_ring are the kernel and user ring, _kh/_kt the kernel head and
 * tail, and _rh the head value read from the user ring.  Every failed
 * check is reported through NM_FAIL_ON() with a SKYWALK_KILL_REASON_*
 * code.  The user head must also be a valid slot index: when the ring is
 * wrapped (ktail < khead) a head at or beyond ckr_num_slots is not caught
 * by the range comparison alone, so it is rejected explicitly.
 */
#define _KR_RXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh)       do {    \
	slot_idx_t _n = (_kring)->ckr_num_slots;                        \
	/* kernel sanity checks */                                      \
	NM_FAIL_ON((_kh) >= _n || (_kt) >= _n,                          \
	    SKYWALK_KILL_REASON_BASIC_SANITY);                          \
	/* user sanity checks */                                        \
	if ((_kt) >= (_kh)) {                                           \
	        /* want khead <= head <= ktail */                       \
	        NM_FAIL_ON((_rh) < (_kh) || (_rh) > (_kt),              \
	            SKYWALK_KILL_REASON_HEAD_OOB);                      \
	} else {                                                        \
	        /* head must be a slot index outside ktail..khead */    \
	        NM_FAIL_ON((_rh) >= _n ||                               \
	            ((_rh) < (_kh) && (_rh) > (_kt)),                   \
	            SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED);              \
	}                                                               \
	NM_FAIL_ON((_ring)->ring_tail != (_kring)->ckr_rtail,           \
	    SKYWALK_KILL_REASON_TAIL_MISMATCH);                         \
} while (0)
974 
975 /*
976  * Validate parameters in the ring/kring on entry for *_rxsync().
977  * Returns ring->ring_head if ok, kring->ckr_num_slots on error,
978  * in order to force a reinit.
979  */
980 slot_idx_t
kr_rxsync_prologue(struct kern_channel * ch,struct __kern_channel_ring * kring,struct proc * p)981 kr_rxsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
982     struct proc *p)
983 {
984 #pragma unused(ch)
985 	struct __user_channel_ring *ring = kring->ckr_ring;
986 	slot_idx_t ckr_khead, ckr_ktail;
987 	slot_idx_t head;
988 	uint32_t byte_count = 0;
989 	uint64_t err_reason = 0;
990 	int slot_count;
991 
992 	VERIFY(sk_is_sync_protected());
993 	/* assert that this routine is only called for user facing rings */
994 	ASSERT(!KR_KERNEL_ONLY(kring));
995 	ASSERT(kring->ckr_usds != NULL);
996 
997 	/* read these once and use local copies */
998 	ckr_khead = kring->ckr_khead;
999 	ckr_ktail = kring->ckr_ktail;
1000 
1001 	SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1002 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1003 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1004 	    kring->ckr_rhead, kring->ckr_rtail,
1005 	    ring->ring_head, ring->ring_tail);
1006 	/*
1007 	 * Before storing the new values, we should check they do not
1008 	 * move backwards. However:
1009 	 * - head is not an issue because the previous value is khead;
1010 	 * - cur could in principle go back, however it does not matter
1011 	 *   because we are processing a brand new rxsync()
1012 	 */
1013 	head = ring->ring_head; /* read only once */
1014 
1015 	_KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
1016 
1017 	/* # of reclaimed slots */
1018 	slot_count = head - kring->ckr_rhead;
1019 	if (slot_count < 0) {
1020 		slot_count += kring->ckr_num_slots;
1021 	}
1022 
1023 	/*
1024 	 * Invoke nexus-specific RX prologue callback, which may detach
1025 	 * and free any consumed packets.  Configured in na_kr_create().
1026 	 */
1027 	if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
1028 	    kring, head, &byte_count, &err_reason, p) != 0)) {
1029 		goto error;
1030 	}
1031 	/* update the user's view of slots & bytes transferred */
1032 	kr_update_user_stats(kring, slot_count, byte_count);
1033 
1034 	/* Update Rx dequeue timestamp */
1035 	if (slot_count > 0) {
1036 		kring->ckr_rx_dequeue_ts = _net_uptime;
1037 	}
1038 
1039 	/* update the kernel view of ring */
1040 	kring->ckr_rhead = head;
1041 	return head;
1042 
1043 error:
1044 	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
1045 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1046 	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1047 	    CKRF_BITS, ckr_khead, ckr_ktail,
1048 	    kring->ckr_rhead, kring->ckr_rtail,
1049 	    ring->ring_head, ring->ring_tail);
1050 
1051 	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_RX_SYNC);
1052 	return kring->ckr_num_slots;
1053 }
1054 
1055 /*
1056  * Validate parameters on the ring/kring on entry for *_alloc_sync().
1057  * Returns ring->ring_head if ok, kring->ckr_num_slots on error,
1058  * in order to force a reinit.
1059  */
1060 slot_idx_t
kr_alloc_sync_prologue(struct __kern_channel_ring * kring,struct proc * p)1061 kr_alloc_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
1062 {
1063 	struct __user_channel_ring *ring = kring->ckr_ring;
1064 	slot_idx_t ckr_khead, ckr_ktail;
1065 	slot_idx_t head;
1066 	uint64_t err_reason = 0;
1067 
1068 	VERIFY(sk_is_sync_protected());
1069 
1070 	/* read these once and use local copies */
1071 	ckr_khead = kring->ckr_khead;
1072 	ckr_ktail = kring->ckr_ktail;
1073 	head = ring->ring_head;
1074 
1075 	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1076 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1077 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1078 	    kring->ckr_rhead, kring->ckr_rtail,
1079 	    head, ring->ring_tail);
1080 	/*
1081 	 * Before storing the new values, we should check they do not
1082 	 * move backwards. However, head is not an issue because the
1083 	 * previous value is khead;
1084 	 */
1085 	_KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
1086 
1087 	/* update the kernel view of ring */
1088 	kring->ckr_rhead = head;
1089 	return head;
1090 
1091 error:
1092 	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
1093 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1094 	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1095 	    CKRF_BITS, ckr_khead, ckr_ktail,
1096 	    kring->ckr_rhead, kring->ckr_rtail,
1097 	    ring->ring_head, ring->ring_tail);
1098 
1099 	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_ALLOC_SYNC);
1100 	return kring->ckr_num_slots;
1101 }
1102 
1103 /*
1104  * Nexus-specific kr_txsync_finalize() callback.
1105  */
1106 void
kr_txfinalize(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t head,struct proc * p)1107 kr_txfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1108     const slot_idx_t head, struct proc *p)
1109 {
1110 #pragma unused(ch)
1111 	struct kern_pbufpool *pp = kring->ckr_pp;
1112 	slot_idx_t slot_idx;
1113 	uint32_t ph_cnt, i = 0;
1114 	int32_t ph_needed;
1115 	int err;
1116 
1117 	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
1118 
1119 	/* use khead value from pre-sync time */
1120 	slot_idx = kring->ckr_khead_pre;
1121 
1122 	ph_needed = head - slot_idx;
1123 	if (ph_needed < 0) {
1124 		ph_needed += kring->ckr_num_slots;
1125 	}
1126 	if (ph_needed == 0) {
1127 		return;
1128 	}
1129 
1130 	ph_cnt = (uint32_t)ph_needed;
1131 	err = kern_pbufpool_alloc_batch(pp, 1, kring->ckr_scratch, &ph_cnt);
1132 	VERIFY(err == 0 && ph_cnt == (uint32_t)ph_needed);
1133 
1134 	/* recycle the transferred packets */
1135 	while (slot_idx != head) {
1136 		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1137 		kern_packet_t ph;
1138 
1139 		if (KSD_VALID_METADATA(ksd)) {
1140 			goto next_slot;
1141 		}
1142 
1143 		ph = kring->ckr_scratch[i];
1144 		ASSERT(ph != 0);
1145 		kring->ckr_scratch[i] = 0;
1146 		++i;
1147 
1148 		/*
1149 		 * Since this packet is freshly allocated and we need
1150 		 * to have the flag set for the attach to succeed,
1151 		 * just set it here rather than calling
1152 		 * __packet_finalize().
1153 		 */
1154 		SK_PTR_ADDR_KQUM(ph)->qum_qflags |= QUM_F_FINALIZED;
1155 
1156 		KR_SLOT_ATTACH_METADATA(kring, ksd, SK_PTR_ADDR_KQUM(ph));
1157 
1158 		kr_externalize_metadata_internal(kring, pp->pp_max_frags,
1159 		    SK_PTR_ADDR_KQUM(ph), p);
1160 next_slot:
1161 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1162 	}
1163 
1164 	if (i != ph_cnt) {
1165 		kern_pbufpool_free_batch(pp, &kring->ckr_scratch[i],
1166 		    ph_cnt - i);
1167 	}
1168 }
1169 
1170 /*
1171  * Nexus-specific kr_txsync_finalize() callback - user packet pool variant.
1172  */
1173 void
kr_txfinalize_upp(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t head,struct proc * p)1174 kr_txfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
1175     const slot_idx_t head, struct proc *p)
1176 {
1177 #pragma unused(ch, p)
1178 	slot_idx_t slot_idx;
1179 	uint32_t nfree = 0;
1180 
1181 	ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
1182 
1183 	/* use khead value from pre-sync time */
1184 	slot_idx = kring->ckr_khead_pre;
1185 
1186 	/* recycle the transferred packets */
1187 	while (slot_idx != head) {
1188 		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1189 
1190 		if (KSD_VALID_METADATA(ksd)) {
1191 			/* detach and free the packet */
1192 			struct __kern_quantum *kqum = ksd->sd_qum;
1193 			(void) KR_SLOT_DETACH_METADATA(kring, ksd);
1194 			ASSERT(nfree < kring->ckr_num_slots);
1195 			kring->ckr_scratch[nfree++] = (uint64_t)kqum;
1196 		}
1197 
1198 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1199 	}
1200 
1201 	if (__probable(nfree > 0)) {
1202 		pp_free_packet_batch(kring->ckr_pp,
1203 		    &kring->ckr_scratch[0], nfree);
1204 	}
1205 }
1206 
1207 /*
1208  * Update kring and ring at the end of txsync.
1209  */
1210 void
kr_txsync_finalize(struct kern_channel * ch,struct __kern_channel_ring * kring,struct proc * p)1211 kr_txsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1212     struct proc *p)
1213 {
1214 	slot_idx_t ckr_khead, ckr_ktail;
1215 	uint32_t slot_size;
1216 	int32_t slot_diff;
1217 
1218 	VERIFY(sk_is_sync_protected());
1219 	/* assert that this routine is only called for user facing rings */
1220 	ASSERT(!KR_KERNEL_ONLY(kring));
1221 
1222 	/* read these once and use local copies */
1223 	ckr_khead = kring->ckr_khead;
1224 	ckr_ktail = kring->ckr_ktail;
1225 
1226 	/*
1227 	 * update userspace-facing channel statistics (# writable bytes/slots)
1228 	 *
1229 	 * Since the ring might be dynamically allocated, we can't rely on the
1230 	 * tail pointer to calculate free TX space (the tail might be sitting
1231 	 * at the edge of allocated ring space but be able to be pushed over
1232 	 * into unallocated ring space).
1233 	 *
1234 	 * Instead, calculate free TX space by looking at what slots are
1235 	 * available to the kernel for TX, and subtracting that from the total
1236 	 * number of possible slots. This is effectively what userspace can
1237 	 * write to.
1238 	 */
1239 	slot_size = PP_BUF_SIZE_DEF(kring->ckr_pp);
1240 	slot_diff = kring->ckr_rhead - ckr_khead;
1241 	if (slot_diff < 0) {
1242 		slot_diff += kring->ckr_num_slots;
1243 	}
1244 	slot_diff = kring->ckr_lim - slot_diff;
1245 	kring->ckr_ready_slots = slot_diff;
1246 	kring->ckr_ready_bytes = slot_diff * slot_size;
1247 
1248 	/*
1249 	 * Invoke nexus-specific TX finalize callback, which may recycle any
1250 	 * transferred packets and/or externalize new ones.  Some nexus don't
1251 	 * have any callback set.  Configured in na_kr_create().
1252 	 */
1253 	if (kring->ckr_finalize != NULL) {
1254 		kring->ckr_finalize(ch, kring, ckr_khead, p);
1255 	}
1256 
1257 	/* update ring tail/khead to what the kernel knows */
1258 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1259 	    kring->ckr_rtail = ckr_ktail;
1260 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1261 
1262 	SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
1263 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1264 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1265 	    kring->ckr_rhead, kring->ckr_rtail,
1266 	    kring->ckr_ring->ring_head,
1267 	    kring->ckr_ring->ring_tail);
1268 }
1269 
1270 /*
1271  * Nexus-specific kr_rxsync_finalize() callback.
1272  */
1273 void
kr_rxfinalize(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t tail,struct proc * p)1274 kr_rxfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1275     const slot_idx_t tail, struct proc *p)
1276 {
1277 #pragma unused(ch)
1278 	const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
1279 	slot_idx_t slot_idx = kring->ckr_rtail;
1280 	uint32_t byte_count = 0;
1281 
1282 	while (slot_idx != tail) {
1283 		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1284 		struct __kern_quantum *kqum = ksd->sd_qum;
1285 
1286 		/*
1287 		 * nexus provider should never leave an empty slot on rx ring.
1288 		 */
1289 		VERIFY(kqum != NULL);
1290 		kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1291 		ASSERT(!(KR_USD(kring, slot_idx)->sd_flags & ~SD_FLAGS_USER));
1292 
1293 		byte_count += kqum->qum_len;
1294 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1295 	}
1296 
1297 	kring->ckr_ready_bytes += byte_count;
1298 
1299 	/* just recalculate slot count using pointer arithmetic */
1300 	int32_t slot_diff = tail - kring->ckr_rhead;
1301 	if (slot_diff < 0) {
1302 		slot_diff += kring->ckr_num_slots;
1303 	}
1304 	kring->ckr_ready_slots = slot_diff;
1305 
1306 #if CONFIG_NEXUS_NETIF
1307 	/*
1308 	 * If this is a channel opened directly to the netif nexus, provide
1309 	 * it feedbacks on the number of packets and bytes consumed.  This
1310 	 * will drive the receive mitigation strategy.
1311 	 */
1312 	if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
1313 	    slot_diff != 0 && byte_count != 0) {
1314 		kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
1315 	}
1316 #endif /* CONFIG_NEXUS_NETIF */
1317 }
1318 
1319 /*
1320  * Nexus-specific kr_rxsync_finalize() callback - user packet pool variant.
1321  */
1322 void
kr_rxfinalize_upp(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t tail,struct proc * p)1323 kr_rxfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
1324     const slot_idx_t tail, struct proc *p)
1325 {
1326 	const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
1327 	slot_idx_t slot_idx = kring->ckr_rtail;
1328 	struct kern_pbufpool *pp = kring->ckr_pp;
1329 	uint32_t byte_count = 0;
1330 
1331 	PP_LOCK(pp);
1332 	while (slot_idx != tail) {
1333 		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1334 		struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
1335 		struct __kern_quantum *kqum = ksd->sd_qum;
1336 
1337 		/*
1338 		 * nexus provider should never leave an empty slot on rx ring.
1339 		 */
1340 		VERIFY(kqum != NULL);
1341 		/*
1342 		 * The channel is operating in packet allocator
1343 		 * mode, so add packet to the allocated list.
1344 		 */
1345 		pp_insert_upp_locked(pp, kqum, ch->ch_pid);
1346 
1347 		KSD_DETACH_METADATA(ksd);
1348 		/* To calculate ckr_ready_bytes by kr_rxsync_prologue */
1349 		USD_SET_LENGTH(usd, (uint16_t)kqum->qum_len);
1350 
1351 		kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1352 		ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);
1353 
1354 		byte_count += kqum->qum_len;
1355 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1356 	}
1357 	PP_UNLOCK(pp);
1358 
1359 	kring->ckr_ready_bytes += byte_count;
1360 
1361 	/* just recalculate slot count using pointer arithmetic */
1362 	int32_t slot_diff = tail - kring->ckr_rhead;
1363 	if (slot_diff < 0) {
1364 		slot_diff += kring->ckr_num_slots;
1365 	}
1366 	kring->ckr_ready_slots = slot_diff;
1367 
1368 #if CONFIG_NEXUS_NETIF
1369 	/*
1370 	 * If this is a channel opened directly to the netif nexus, provide
1371 	 * it feedbacks on the number of packets and bytes consumed.  This
1372 	 * will drive the receive mitigation strategy.
1373 	 */
1374 	if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
1375 	    slot_diff != 0 && byte_count != 0) {
1376 		kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
1377 	}
1378 #endif /* CONFIG_NEXUS_NETIF */
1379 }
1380 
1381 /*
1382  * Update kring and ring at the end of rxsync
1383  */
1384 void
kr_rxsync_finalize(struct kern_channel * ch,struct __kern_channel_ring * kring,struct proc * p)1385 kr_rxsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1386     struct proc *p)
1387 {
1388 #pragma unused(ch, p)
1389 	slot_idx_t ckr_khead, ckr_ktail;
1390 
1391 	VERIFY(sk_is_sync_protected());
1392 	/* assert that this routine is only called for user facing rings */
1393 	ASSERT(!KR_KERNEL_ONLY(kring));
1394 	ASSERT(kring->ckr_usds != NULL);
1395 
1396 	/* read these once and use local copies */
1397 	ckr_khead = kring->ckr_khead;
1398 	ckr_ktail = kring->ckr_ktail;
1399 
1400 	/*
1401 	 * Invoke nexus-specific RX finalize callback; set in na_kr_create().
1402 	 */
1403 	if (kring->ckr_finalize != NULL) {
1404 		kring->ckr_finalize(ch, kring, ckr_ktail, p);
1405 	}
1406 
1407 	/* update ring tail/khead to what the kernel knows */
1408 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1409 	    kring->ckr_rtail = ckr_ktail;
1410 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1411 
1412 	SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1413 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1414 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1415 	    kring->ckr_rhead, kring->ckr_rtail,
1416 	    kring->ckr_ring->ring_head,
1417 	    kring->ckr_ring->ring_tail);
1418 }
1419 
1420 void
kr_alloc_sync_finalize(struct __kern_channel_ring * kring,struct proc * p)1421 kr_alloc_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
1422 {
1423 #pragma unused(p)
1424 	slot_idx_t ckr_khead, ckr_ktail;
1425 
1426 	VERIFY(sk_is_sync_protected());
1427 	/* read these once and use local copies */
1428 	ckr_khead = kring->ckr_khead;
1429 	ckr_ktail = kring->ckr_ktail;
1430 
1431 	/* update ring tail/khead to what the kernel knows */
1432 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1433 	    kring->ckr_rtail = ckr_ktail;
1434 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1435 	*(uint32_t *)(uintptr_t)&kring->ckr_ring->ring_alloc_ws =
1436 	    kring->ckr_alloc_ws;
1437 
1438 	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1439 	    "rh %u rt %u | h %u t %u | ws %u",
1440 	    sk_proc_name_address(p),
1441 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1442 	    kring->ckr_rhead, kring->ckr_rtail,
1443 	    kring->ckr_ring->ring_head,
1444 	    kring->ckr_ring->ring_tail, kring->ckr_alloc_ws);
1445 }
1446 
1447 void
kr_free_sync_finalize(struct __kern_channel_ring * kring,struct proc * p)1448 kr_free_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
1449 {
1450 #pragma unused(p)
1451 	slot_idx_t ckr_khead, ckr_ktail;
1452 
1453 	VERIFY(sk_is_sync_protected());
1454 	/* read these once and use local copies */
1455 	ckr_khead = kring->ckr_khead;
1456 	ckr_ktail = kring->ckr_ktail;
1457 
1458 	/* update ring tail/khead to what the kernel knows */
1459 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1460 	    kring->ckr_rtail = ckr_ktail;
1461 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1462 
1463 	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1464 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1465 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1466 	    kring->ckr_rhead, kring->ckr_rtail,
1467 	    kring->ckr_ring->ring_head,
1468 	    kring->ckr_ring->ring_tail);
1469 }
1470 
1471 slot_idx_t
kr_event_sync_prologue(struct __kern_channel_ring * kring,struct proc * p)1472 kr_event_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
1473 {
1474 	struct __user_channel_ring *ring = kring->ckr_ring;
1475 	slot_idx_t ckr_khead, ckr_ktail;
1476 	slot_idx_t head, slot_idx;
1477 	uint64_t err_reason = 0;
1478 
1479 	ASSERT(kring->ckr_tx == NR_EV);
1480 	VERIFY(sk_is_sync_protected());
1481 
1482 	/* read these once and use local copies */
1483 	ckr_khead = kring->ckr_khead;
1484 	ckr_ktail = kring->ckr_ktail;
1485 	head = ring->ring_head;
1486 
1487 	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1488 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1489 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1490 	    kring->ckr_rhead, kring->ckr_rtail,
1491 	    head, ring->ring_tail);
1492 	/*
1493 	 * Before storing the new values, we should check they do not
1494 	 * move backwards. However, head is not an issue because the
1495 	 * previous value is khead;
1496 	 */
1497 	_KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
1498 
1499 	/*
1500 	 * Iterating through the slots just read by user-space;
1501 	 * ckr_rhead -> ring_head
1502 	 */
1503 	slot_idx = kring->ckr_rhead;
1504 	while (slot_idx != head) {
1505 		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1506 		struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
1507 		/*
1508 		 * ensure that the user has detached packet from slot.
1509 		 */
1510 		VERIFY(!KSD_VALID_METADATA(ksd));
1511 		if (__improbable(SD_VALID_METADATA(usd))) {
1512 			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not "
1513 			    "detached md %u kh %u kt %u | rh %u rt %u |"
1514 			    " h %u t %u", sk_proc_name_address(p),
1515 			    sk_proc_pid(p), kring->ckr_name,
1516 			    SK_KVA(kring), slot_idx, usd->sd_md_idx,
1517 			    ckr_khead, ckr_ktail, kring->ckr_rhead,
1518 			    kring->ckr_rtail, ring->ring_head,
1519 			    ring->ring_tail);
1520 			err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
1521 			goto error;
1522 		}
1523 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1524 	}
1525 
1526 	/* update the kernel view of ring */
1527 	kring->ckr_rhead = head;
1528 	return head;
1529 
1530 error:
1531 	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
1532 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1533 	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1534 	    CKRF_BITS, ckr_khead, ckr_ktail,
1535 	    kring->ckr_rhead, kring->ckr_rtail,
1536 	    ring->ring_head, ring->ring_tail);
1537 
1538 	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_EVENT_SYNC);
1539 	return kring->ckr_num_slots;
1540 }
1541 
1542 void
kr_event_sync_finalize(struct kern_channel * ch,struct __kern_channel_ring * kring,struct proc * p)1543 kr_event_sync_finalize(struct kern_channel *ch,
1544     struct __kern_channel_ring *kring, struct proc *p)
1545 {
1546 #pragma unused(ch)
1547 	struct kern_pbufpool *pp = kring->ckr_pp;
1548 	const uint32_t maxfrags = pp->pp_max_frags;
1549 	slot_idx_t ckr_khead, ckr_ktail, ckr_rhead;
1550 	struct __kern_slot_desc *ksd;
1551 	struct __user_slot_desc *usd;
1552 	struct __kern_quantum *kqum;
1553 
1554 	VERIFY(sk_is_sync_protected());
1555 	/* assert that this routine is only called for user facing rings */
1556 	ASSERT(!KR_KERNEL_ONLY(kring));
1557 	ASSERT(kring->ckr_usds != NULL);
1558 	ASSERT(kring->ckr_tx == NR_EV);
1559 
1560 	/* read these once and use local copies */
1561 	ckr_khead = kring->ckr_khead;
1562 	ckr_ktail = kring->ckr_ktail;
1563 	ckr_rhead = kring->ckr_rhead;
1564 
1565 	slot_idx_t slot_idx = kring->ckr_rtail;
1566 	PP_LOCK(pp);
1567 	while (slot_idx != ckr_ktail) {
1568 		ksd = KR_KSD(kring, slot_idx);
1569 		usd = KR_USD(kring, slot_idx);
1570 		kqum = ksd->sd_qum;
1571 
1572 		/*
1573 		 * Add packet to the allocated list of user packet pool.
1574 		 */
1575 		pp_insert_upp_locked(pp, kqum, ch->ch_pid);
1576 
1577 		KSD_DETACH_METADATA(ksd);
1578 		kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1579 		ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);
1580 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1581 	}
1582 	PP_UNLOCK(pp);
1583 
1584 	/* just recalculate slot count using pointer arithmetic */
1585 	int32_t slot_diff = ckr_ktail - ckr_rhead;
1586 	if (slot_diff < 0) {
1587 		slot_diff += kring->ckr_num_slots;
1588 	}
1589 	kring->ckr_ready_slots = slot_diff;
1590 
1591 	/* update ring tail/khead to what the kernel knows */
1592 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1593 	    kring->ckr_rtail = ckr_ktail;
1594 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1595 
1596 	SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1597 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1598 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1599 	    kring->ckr_rhead, kring->ckr_rtail,
1600 	    kring->ckr_ring->ring_head,
1601 	    kring->ckr_ring->ring_tail);
1602 }
1603 #undef NM_FAIL_ON
1604 
1605 void
kr_txkring_reclaim_and_refill(struct __kern_channel_ring * kring,slot_idx_t index)1606 kr_txkring_reclaim_and_refill(struct __kern_channel_ring *kring,
1607     slot_idx_t index)
1608 {
1609 	const slot_idx_t lim = kring->ckr_lim;
1610 	slot_idx_t next_index = SLOT_NEXT(index, lim);
1611 
1612 	kring->ckr_khead = next_index;
1613 	/* reclaim */
1614 	kring->ckr_ktail = index;
1615 }
1616 
1617 /*
1618  * *************************************************************************
1619  * Checks on packet header offsets in kr_internalize_metadata
1620  * *************************************************************************
1621  *
1622  *  +----------+------------------------------+----------------------------+
1623  *  |          | NEXUS_META_SUBTYPE_RAW       | NEXUS_META_SUBTYPE_PAYLOAD |
1624  *  |----------+------------------------------+----------------------------+
1625  *  | buflet   | (bdoff + len) <= dlim        | (bdoff + len) <= dlim      |
1626  *  |----------+------------------------------+----------------------------+
1627  *  | headroom | hr == bdoff && hr < bdlim    | hr == 0 && bdoff == 0      |
1628  *  |----------+------------------------------+----------------------------+
1629  *  | l2_len   | hr + l2_len < bdlim          | l2_len == 0                |
1630  *  |----------+------------------------------+----------------------------+
1631  */
1632 int
kr_internalize_metadata(struct kern_channel * ch,struct __kern_channel_ring * kring,const uint32_t maxfrags,struct __kern_quantum * kqum,struct proc * p)1633 kr_internalize_metadata(struct kern_channel *ch,
1634     struct __kern_channel_ring *kring, const uint32_t maxfrags,
1635     struct __kern_quantum *kqum, struct proc *p)
1636 {
1637 #pragma unused(kring, maxfrags, p)
1638 	struct __user_buflet *ubuf, *pubuf;     /* user buflet */
1639 	struct __kern_buflet *kbuf, *pkbuf;     /* kernel buflet */
1640 	struct __user_quantum *uqum;            /* user source */
1641 	struct __user_packet *upkt;
1642 	struct __kern_packet *kpkt;
1643 	const nexus_meta_type_t md_type = METADATA_TYPE(kqum);
1644 	const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum);
1645 	uint32_t len = 0, bdoff, bdlim;
1646 	uint16_t bcnt = 0, bmax, i;
1647 	boolean_t dropped;
1648 	int err = 0;
1649 
1650 	/*
1651 	 * Verify that the quantum/packet belongs to the same pp as
1652 	 * the one used by the adapter, i.e. the packet must have
1653 	 * been allocated from the same pp and attached to the kring.
1654 	 */
1655 	ASSERT(kqum->qum_pp == kring->ckr_pp);
1656 
1657 	_CASSERT(sizeof(uqum->qum_com) == sizeof(kqum->qum_com));
1658 	_CASSERT(sizeof(upkt->pkt_com) == sizeof(kpkt->pkt_com));
1659 	uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
1660 	ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
1661 	upkt = SK_PTR_ADDR_UPKT(uqum);
1662 	kpkt = SK_PTR_ADDR_KPKT(kqum);
1663 
1664 	DTRACE_SKYWALK3(internalize, struct __kern_channel_ring *, kring,
1665 	    struct __kern_packet *, kpkt, struct __user_packet *, upkt);
1666 	SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx uqum 0x%llx -> kqum 0x%llx",
1667 	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring),
1668 	    SK_KVA(uqum), SK_KVA(kqum));
1669 
1670 	/* check if it's dropped before we internalize it */
1671 	dropped = ((uqum->qum_qflags & QUM_F_DROPPED) != 0);
1672 
1673 	/*
1674 	 * Internalize common quantum metadata.
1675 	 *
1676 	 * For packet metadata, we trust the kernel copy for the buflet
1677 	 * count and limit; any mismatch on the user copy will cause
1678 	 * us to drop this packet.
1679 	 */
1680 	_QUM_INTERNALIZE(uqum, kqum);
1681 
1682 	/* if marked as dropped, don't bother going further */
1683 	if (__improbable(dropped)) {
1684 		SK_ERR("%s(%d) kring 0x%llx dropped",
1685 		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring));
1686 		err = ERANGE;
1687 		goto done;
1688 	}
1689 
1690 	switch (md_type) {
1691 	case NEXUS_META_TYPE_PACKET:
1692 		/*
1693 		 * Internalize common packet metadata.
1694 		 */
1695 		_PKT_INTERNALIZE(upkt, kpkt);
1696 
1697 		switch (md_subtype) {
1698 		case NEXUS_META_SUBTYPE_PAYLOAD:
1699 			/* sanitize link layer fields for payload mode */
1700 			kpkt->pkt_link_flags = 0;
1701 			break;
1702 		default:
1703 			break;
1704 		}
1705 
1706 		if (__probable(ch != NULL)) {
1707 			_UUID_COPY(kpkt->pkt_flowsrc_id,
1708 			    ch->ch_info->cinfo_ch_id);
1709 		}
1710 
1711 		bcnt = upkt->pkt_bufs_cnt;
1712 		bmax = kpkt->pkt_bufs_max;
1713 		ASSERT(bmax == maxfrags);
1714 		if (__improbable((bcnt == 0) || (bcnt > bmax) ||
1715 		    (upkt->pkt_bufs_max != bmax))) {
1716 			SK_ERR("%s(%d) kring 0x%llx bad bufcnt %d, %d, %d",
1717 			    sk_proc_name_address(p), sk_proc_pid(p),
1718 			    SK_KVA(kring), bcnt, bmax, upkt->pkt_bufs_max);
1719 			err = ERANGE;
1720 			goto done;
1721 		}
1722 		break;
1723 
1724 	case NEXUS_META_TYPE_QUANTUM:
1725 		ASSERT(maxfrags == 1);
1726 		bcnt = bmax = 1;
1727 		break;
1728 
1729 	default:
1730 		VERIFY(0);
1731 		/* NOTREACHED */
1732 		__builtin_unreachable();
1733 	}
1734 
1735 	ASSERT(bcnt != 0);
1736 	ubuf = pubuf = NULL;
1737 	kbuf = pkbuf = NULL;
1738 
1739 	/*
1740 	 * Validate and internalize buflets.
1741 	 */
1742 	for (i = 0; i < bcnt; i++) {
1743 		_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
1744 		_CASSERT(offsetof(struct __user_packet, pkt_qum) == 0);
1745 		_CASSERT(offsetof(struct __kern_quantum, qum_com) == 0);
1746 		PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
1747 		ASSERT(kbuf != NULL);
1748 		if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
1749 			struct __kern_buflet_ext *kbuf_ext;
1750 
1751 			kbuf_ext = __container_of(kbuf,
1752 			    struct __kern_buflet_ext, kbe_overlay);
1753 			ubuf = __DECONST(struct __user_buflet *,
1754 			    kbuf_ext->kbe_buf_user);
1755 		} else {
1756 			ASSERT(i == 0);
1757 			ubuf = __DECONST(struct __user_buflet *,
1758 			    &uqum->qum_buf[0]);
1759 		}
1760 		ASSERT(ubuf != NULL);
1761 		ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
1762 		ASSERT(kbuf->buf_dlim == _BUF_DLIM(kbuf, kqum->qum_pp));
1763 		ASSERT(kbuf->buf_addr != 0);
1764 		/*
1765 		 * For now, user-facing pool does not support shared
1766 		 * buffer, since otherwise the ubuf and kbuf buffer
1767 		 * indices would not match.  Assert this is the case.
1768 		 */
1769 		ASSERT(kbuf->buf_addr == (mach_vm_address_t)kbuf->buf_objaddr);
1770 
1771 		kbuf->buf_dlen = ubuf->buf_dlen;
1772 		kbuf->buf_doff = ubuf->buf_doff;
1773 
1774 		/*
1775 		 * kernel and user metadata use the same object index
1776 		 * also checks the sanity of buflet data offset and length
1777 		 */
1778 		if (__improbable(!BUF_IN_RANGE(kbuf) ||
1779 		    ubuf->buf_idx != kbuf->buf_idx)) {
1780 			kbuf->buf_dlen = kbuf->buf_doff = 0;
1781 			SK_ERR("%s(%d) kring 0x%llx bad bufidx 0x%x, 0x%x",
1782 			    sk_proc_name_address(p), sk_proc_pid(p),
1783 			    SK_KVA(kring), kbuf->buf_idx, ubuf->buf_idx);
1784 			err = ERANGE;
1785 			goto done;
1786 		}
1787 
1788 		/* save data offset from the first buflet */
1789 		if (pkbuf == NULL) {
1790 			bdoff = kbuf->buf_doff;
1791 		}
1792 
1793 		/* all good to go */
1794 		len += kbuf->buf_dlen;
1795 		pubuf = ubuf;
1796 		pkbuf = kbuf;
1797 	}
1798 
1799 	_CASSERT(offsetof(struct __kern_packet, pkt_length) ==
1800 	    offsetof(struct __kern_packet, pkt_qum.qum_len));
1801 	if (__improbable(kpkt->pkt_length != len)) {
1802 		SK_ERR("%s(%d) kring 0x%llx bad pktlen %d, %d",
1803 		    sk_proc_name_address(p), sk_proc_pid(p),
1804 		    SK_KVA(kring), kpkt->pkt_length, len);
1805 		err = ERANGE;
1806 		goto done;
1807 	}
1808 
1809 	if ((err == 0) && (md_type == NEXUS_META_TYPE_PACKET)) {
1810 		bdlim = PP_BUF_SIZE_DEF(kqum->qum_pp);
1811 		switch (md_subtype) {
1812 		case NEXUS_META_SUBTYPE_RAW:
1813 			/*
1814 			 * For a raw packet from user space we need to
1815 			 * validate that headroom is sane and is in the
1816 			 * first buflet.
1817 			 */
1818 			if (__improbable(kpkt->pkt_headroom != bdoff)) {
1819 				SK_ERR("%s(%d) kring 0x%llx bad headroom %d, %d",
1820 				    sk_proc_name_address(p), sk_proc_pid(p),
1821 				    SK_KVA(kring), kpkt->pkt_headroom, bdoff);
1822 				err = ERANGE;
1823 				goto done;
1824 			}
1825 			if (__improbable(kpkt->pkt_headroom +
1826 			    kpkt->pkt_l2_len >= bdlim)) {
1827 				SK_ERR("%s(%d) kring 0x%llx bad headroom l2len %d, %d",
1828 				    sk_proc_name_address(p), sk_proc_pid(p),
1829 				    SK_KVA(kring), kpkt->pkt_l2_len, bdlim);
1830 				err = ERANGE;
1831 				goto done;
1832 			}
1833 			break;
1834 		case NEXUS_META_SUBTYPE_PAYLOAD:
1835 			/*
1836 			 * For a payload packet from user space we need
1837 			 * to validate that payload starts from 0 and L2
1838 			 * length is 0.
1839 			 */
1840 			if (__improbable((kpkt->pkt_headroom != 0) ||
1841 			    (kpkt->pkt_l2_len != 0))) {
1842 				SK_ERR("%s(%d) kring 0x%llx bad headroom "
1843 				    "payload subtype %d headroom %d l2len %d",
1844 				    sk_proc_name_address(p), sk_proc_pid(p),
1845 				    SK_KVA(kring), SK_PTR_SUBTYPE(kpkt),
1846 				    kpkt->pkt_headroom, kpkt->pkt_l2_len);
1847 				err = ERANGE;
1848 				goto done;
1849 			}
1850 			break;
1851 		default:
1852 			VERIFY(0);
1853 			/* NOTREACHED */
1854 			__builtin_unreachable();
1855 		}
1856 
1857 		/* validate checksum offload properties */
1858 		if (__probable(PACKET_HAS_PARTIAL_CHECKSUM(kpkt))) {
1859 			uint16_t start = kpkt->pkt_csum_tx_start_off;
1860 			uint16_t stuff = kpkt->pkt_csum_tx_stuff_off;
1861 			if (__improbable(start > stuff ||
1862 			    start > kpkt->pkt_length ||
1863 			    (stuff + sizeof(uint16_t)) > kpkt->pkt_length)) {
1864 				SK_ERR("%s(%d) flags 0x%x start %u stuff %u "
1865 				    "len %u", sk_proc_name_address(p),
1866 				    sk_proc_pid(p), kpkt->pkt_csum_flags,
1867 				    start, stuff, kpkt->pkt_length);
1868 				err = ERANGE;
1869 				goto done;
1870 			}
1871 		} else {
1872 			kpkt->pkt_csum_tx_start_off = 0;
1873 			kpkt->pkt_csum_tx_stuff_off = 0;
1874 		}
1875 		*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = bcnt;
1876 	}
1877 
1878 done:
1879 	if (__probable(err == 0)) {
1880 		kqum->qum_len = len;
1881 		kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_FINALIZED);
1882 	} else {
1883 		kqum->qum_len = 0;
1884 		kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_DROPPED);
1885 	}
1886 	return err;
1887 }
1888 
1889 __attribute__((always_inline))
1890 static inline void
kr_externalize_metadata_internal(struct __kern_channel_ring * kring,const uint32_t maxfrags,struct __kern_quantum * kqum,struct proc * p)1891 kr_externalize_metadata_internal(struct __kern_channel_ring *kring,
1892     const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
1893 {
1894 #pragma unused(kring, maxfrags, p)
1895 	struct __kern_buflet *kbuf, *pkbuf;     /* kernel buflet */
1896 	struct __user_buflet *ubuf, *pubuf;     /* user buflet */
1897 	struct __user_quantum *uqum;            /* user destination */
1898 	struct __user_packet *upkt;
1899 	struct __kern_packet *kpkt;
1900 	const nexus_meta_type_t md_type = METADATA_TYPE(kqum);
1901 	const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum);
1902 	uint32_t len = 0;
1903 	uint16_t bcnt = 0, bmax, i;
1904 
1905 	/*
1906 	 * Verify that the quantum/packet belongs to the same pp as
1907 	 * the one used by the adapter, i.e. the packet must have
1908 	 * been allocated from the same pp and attached to the kring.
1909 	 */
1910 	ASSERT(kqum->qum_pp == kring->ckr_pp);
1911 	ASSERT(kqum->qum_qflags & (QUM_F_FINALIZED | QUM_F_INTERNALIZED));
1912 
1913 	_CASSERT(sizeof(kpkt->pkt_com) == sizeof(upkt->pkt_com));
1914 	_CASSERT(sizeof(kqum->qum_com) == sizeof(uqum->qum_com));
1915 	uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
1916 	ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
1917 	upkt = SK_PTR_ADDR_UPKT(uqum);
1918 	kpkt = SK_PTR_ADDR_KPKT(kqum);
1919 
1920 	DTRACE_SKYWALK3(externalize, struct __kern_channel_ring *, kring,
1921 	    struct __kern_packet *, kpkt, struct __user_packet *, upkt);
1922 	SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx kqum 0x%llx -> uqum 0x%llx",
1923 	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring),
1924 	    SK_KVA(kqum), SK_KVA(uqum));
1925 
1926 	/*
1927 	 * Externalize common quantum metadata.
1928 	 */
1929 	_QUM_EXTERNALIZE(kqum, uqum);
1930 
1931 	switch (md_type) {
1932 	case NEXUS_META_TYPE_PACKET: {
1933 		bcnt = kpkt->pkt_bufs_cnt;
1934 		bmax = kpkt->pkt_bufs_max;
1935 		ASSERT(bmax == maxfrags);
1936 		ASSERT(bcnt <= bmax);
1937 		/*
1938 		 * Externalize common packet metadata.
1939 		 */
1940 		_PKT_EXTERNALIZE(kpkt, upkt);
1941 
1942 		/* sanitize buflet count and limit (deconst) */
1943 		_CASSERT(sizeof(upkt->pkt_bufs_max) == sizeof(uint16_t));
1944 		_CASSERT(sizeof(upkt->pkt_bufs_cnt) == sizeof(uint16_t));
1945 		*(uint16_t *)(uintptr_t)&upkt->pkt_bufs_max = bmax;
1946 		*(uint16_t *)(uintptr_t)&upkt->pkt_bufs_cnt = bcnt;
1947 
1948 		switch (md_subtype) {
1949 		case NEXUS_META_SUBTYPE_PAYLOAD:
1950 			/* sanitize link layer fields for payload mode */
1951 			upkt->pkt_headroom = 0;
1952 			upkt->pkt_link_flags = 0;
1953 			break;
1954 		default:
1955 			break;
1956 		}
1957 		break;
1958 	}
1959 
1960 	case NEXUS_META_TYPE_QUANTUM:
1961 		ASSERT(maxfrags == 1);
1962 		bcnt = bmax = 1;
1963 		break;
1964 
1965 	default:
1966 		VERIFY(0);
1967 		/* NOTREACHED */
1968 		__builtin_unreachable();
1969 	}
1970 
1971 	ASSERT(bcnt != 0);
1972 	/*
1973 	 * special handling to externalize empty packet buflet.
1974 	 */
1975 	kbuf = &kpkt->pkt_qum.qum_buf[0];
1976 	if (kbuf->buf_addr == 0) {
1977 		ubuf = __DECONST(struct __user_buflet *,
1978 		    &kpkt->pkt_qum.qum_user->qum_buf[0]);
1979 		UBUF_INIT(kbuf, ubuf);
1980 	}
1981 
1982 	kbuf = pkbuf = NULL;
1983 	ubuf = pubuf = NULL;
1984 	/*
1985 	 * Externalize buflets.
1986 	 */
1987 	for (i = 0; i < bcnt; i++) {
1988 		_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
1989 		PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
1990 		ASSERT(kbuf != NULL);
1991 
1992 		if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
1993 			struct __kern_buflet_ext *kbuf_ext;
1994 
1995 			kbuf_ext = __container_of(kbuf,
1996 			    struct __kern_buflet_ext, kbe_overlay);
1997 			ubuf = __DECONST(struct __user_buflet *,
1998 			    kbuf_ext->kbe_buf_user);
1999 		} else {
2000 			ASSERT(i == 0);
2001 			ubuf = __DECONST(struct __user_buflet *,
2002 			    &kpkt->pkt_qum.qum_user->qum_buf[0]);
2003 		}
2004 
2005 		ASSERT(ubuf != NULL);
2006 		ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
2007 		ASSERT(BUF_IN_RANGE(kbuf));
2008 		KBUF_EXTERNALIZE(kbuf, ubuf, kqum->qum_pp);
2009 
2010 		/* all good to go */
2011 		len += kbuf->buf_dlen;
2012 		pkbuf = kbuf;
2013 		pubuf = ubuf;
2014 	}
2015 
2016 	uqum->qum_len = len;
2017 	uqum->qum_qflags |= QUM_F_FINALIZED;
2018 
2019 	/*
2020 	 * XXX: [email protected] -- do this during reclaim instead?
2021 	 */
2022 	kqum->qum_qflags &= ~QUM_F_INTERNALIZED;
2023 }
2024 
2025 
/*
 * Externally visible entry point for externalizing the metadata of a
 * kernel quantum/packet.  All four arguments are forwarded unchanged to
 * kr_externalize_metadata_internal(), which does the actual work.
 */
void
kr_externalize_metadata(struct __kern_channel_ring *kring,
    const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
{
	kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
}
2032