xref: /xnu-8020.121.3/bsd/skywalk/channel/channel_ring.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #include <kern/sched_prim.h>
31 #include <sys/sdt.h>
32 
static void kr_update_user_stats(struct __kern_channel_ring *,
    uint32_t, uint32_t);
static void kr_externalize_metadata_internal(struct __kern_channel_ring *,
    const uint32_t, struct __kern_quantum *, struct proc *);

#define KR_TRANSFER_DECAY       2       /* ilog2 of EWMA decay rate (4) */
/* global decay override; 0 means use the per-ring ckr_transfer_decay */
static uint32_t kr_transfer_decay = 0;

#define KR_ACCUMULATE_INTERVAL  2 /* 2 seconds */
/* window (in seconds) over which ring throughput stats are accumulated */
static uint32_t kr_accumulate_interval = KR_ACCUMULATE_INTERVAL;

#if (DEVELOPMENT || DEBUG)
#define KR_STAT_ENABLE          1
#else /* !(DEVELOPMENT || DEBUG) */
#define KR_STAT_ENABLE          0
#endif /* !(DEVELOPMENT || DEBUG) */
/* Enable/Disable ring stats collection */
uint32_t kr_stat_enable = KR_STAT_ENABLE;

#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_transfer_decay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &kr_transfer_decay,
    0, "ilog2 of EWMA decay rate of ring transfers");

SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_accumulate_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &kr_accumulate_interval,
    KR_ACCUMULATE_INTERVAL, "accumulation interval for ring stats");

/* development-only escape hatch: skip panic on ring sync errors */
uint32_t kr_disable_panic_on_sync_err = 0;
SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_panic_on_sync_err,
    CTLFLAG_RW | CTLFLAG_LOCKED, &kr_disable_panic_on_sync_err,
    0, "disable panic on sync error");
#endif /* (DEVELOPMENT || DEBUG) */

SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_enable,
    CTLFLAG_RW | CTLFLAG_LOCKED, &kr_stat_enable,
    0, "enable/disable stats collection for ring");
70 
/*
 * Exponentially weighted moving average:
 *   avg <- avg - avg/2^decay + new/2^decay
 * computed with shifts only (no division).  The first sample (old == 0)
 * seeds the average directly.
 */
#define KR_EWMA(old, new, decay) do {                                   \
	u_int64_t _avg;                                                 \
	if (__probable((_avg = (old)) > 0))                             \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
79 
80 void
kr_init_to_mhints(struct __kern_channel_ring * kring,uint32_t nslots)81 kr_init_to_mhints(struct __kern_channel_ring *kring, uint32_t nslots)
82 {
83 	uint32_t tail;
84 
85 	tail = nslots - 1;
86 
87 	kring->ckr_transfer_decay = KR_TRANSFER_DECAY;
88 	kring->ckr_num_slots = nslots;
89 	*(slot_idx_t *)(uintptr_t)&kring->ckr_lim = (nslots - 1);
90 	kring->ckr_rhead = kring->ckr_khead = 0;
91 	/* IMPORTANT: Always keep one slot empty */
92 	kring->ckr_rtail = kring->ckr_ktail =
93 	    ((kring->ckr_tx == NR_TX) || (kring->ckr_tx == NR_F) ? tail : 0);
94 }
95 
/*
 * Try to obtain exclusive right to issue the *sync() or state change
 * operations on the ring.  The right is obtained and must be later
 * relinquished via kr_exit() if and only if kr_enter() returns 0.
 *
 * In all cases the caller will typically skip the ring, possibly collecting
 * errors along the way.
 *
 * If the calling context does not allow sleeping, the caller must pass
 * FALSE in can_sleep; EBUSY may be returned if the right is held by
 * another thread.  Otherwise, the caller may block until the right is
 * released by the previous holder.
 */
int
kr_enter(struct __kern_channel_ring *kr, boolean_t can_sleep)
{
	lck_spin_lock(&kr->ckr_slock);
	/* recursive entry by the current owner just bumps the busy count */
	if (kr->ckr_owner == current_thread()) {
		ASSERT(kr->ckr_busy != 0);
		kr->ckr_busy++;
		goto done;
	}
	if (!can_sleep) {
		/* non-sleeping context: fail fast if the right is held */
		if (kr->ckr_busy != 0) {
			lck_spin_unlock(&kr->ckr_slock);
			return EBUSY;
		}
	} else {
		/*
		 * Wait until the holder drops ckr_busy to zero.  ckr_want
		 * counts waiters so kr_exit() knows to call wakeup(); the
		 * spinlock is dropped across thread_block() and reacquired
		 * before ckr_busy is re-examined.
		 */
		while (kr->ckr_busy != 0) {
			kr->ckr_want++;
			(void) assert_wait(&kr->ckr_busy, THREAD_UNINT);
			lck_spin_unlock(&kr->ckr_slock);
			(void) thread_block(THREAD_CONTINUE_NULL);
			SK_DF(SK_VERB_LOCKS, "waited for kr \"%s\" "
			    "(0x%llx) busy=%u", kr->ckr_name,
			    SK_KVA(kr), kr->ckr_busy);
			lck_spin_lock(&kr->ckr_slock);
		}
	}
	LCK_SPIN_ASSERT(&kr->ckr_slock, LCK_ASSERT_OWNED);
	ASSERT(kr->ckr_busy == 0);
	/* claim the right: mark busy and record ownership */
	kr->ckr_busy++;
	kr->ckr_owner = current_thread();
done:
	lck_spin_unlock(&kr->ckr_slock);

	SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right acquired",
	    kr->ckr_name, SK_KVA(kr));

	return 0;
}
147 
/*
 * Relinquish the exclusive right obtained via kr_enter().  Must be
 * called by the owning thread.  The outermost exit (busy count drops
 * to zero) clears ownership and wakes up any threads blocked in
 * kr_enter() waiting on ckr_busy.
 */
void
kr_exit(struct __kern_channel_ring *kr)
{
	uint32_t want = 0;

	lck_spin_lock(&kr->ckr_slock);
	ASSERT(kr->ckr_busy != 0);
	ASSERT(kr->ckr_owner == current_thread());
	if (--kr->ckr_busy == 0) {
		kr->ckr_owner = NULL;

		/*
		 * we're done with the kring;
		 * notify anyone that has lost the race
		 */
		if ((want = kr->ckr_want) != 0) {
			kr->ckr_want = 0;
			wakeup((void *)&kr->ckr_busy);
			lck_spin_unlock(&kr->ckr_slock);
		} else {
			lck_spin_unlock(&kr->ckr_slock);
		}
	} else {
		lck_spin_unlock(&kr->ckr_slock);
	}

	SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right released (%u waiters)",
	    kr->ckr_name, SK_KVA(kr), want);
}
177 
178 
/*
 * Move a previously stopped/locked kring back to KR_READY, then drop
 * the busy reference that kr_stop() left held (via kr_exit()).
 */
void
kr_start(struct __kern_channel_ring *kr)
{
	lck_spin_lock(&kr->ckr_slock);
	/* must still hold the busy reference taken by kr_stop() */
	ASSERT(kr->ckr_busy != 0);
	ASSERT(kr->ckr_state == KR_STOPPED || kr->ckr_state == KR_LOCKED);
	/* now clear the state */
	kr->ckr_state = KR_READY;
	lck_spin_unlock(&kr->ckr_slock);

	kr_exit(kr);

	SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) is started",
	    kr->ckr_name, SK_KVA(kr));
}
194 
/*
 * Put the kring in the 'stopped' state: either KR_STOPPED or KR_LOCKED.
 * Also marks the ring as busy, which would require either kr_start() at a
 * later point.
 */
void
kr_stop(struct __kern_channel_ring *kr, uint32_t state)
{
	uint32_t s;

	ASSERT(state == KR_STOPPED || state == KR_LOCKED);

	/*
	 * Acquire the exclusive right; it stays held (busy count kept)
	 * until kr_start() releases it.  kr_enter() with can_sleep=TRUE
	 * always returns 0.
	 */
	s = kr_enter(kr, TRUE);
	ASSERT(s == 0);

	lck_spin_lock(&kr->ckr_slock);
	ASSERT(kr->ckr_busy != 0);
	/* now set the state */
	kr->ckr_state = state;
	lck_spin_unlock(&kr->ckr_slock);

	SK_DF(SK_VERB_LOCKS,
	    "kr \"%s\" (0x%llx) krflags 0x%b is now stopped s=%u",
	    kr->ckr_name, SK_KVA(kr), kr->ckr_flags, CKRF_BITS, state);
}
220 
/*
 * Update the user-visible (per-sync) statistics for a ring: sync/byte/
 * slot totals, min/max slots per sync, and moving averages of bytes and
 * slots per sync, accumulated over kr_accumulate_interval seconds.
 * Caller must ensure thread safety (sync-protected context).
 */
static void
kr_update_user_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
    uint32_t byte_count)
{
	uint64_t now;
	/* the global sysctl override, if set, wins over the per-ring decay */
	uint32_t transfer_decay = (kr_transfer_decay != 0) ?
	    kr_transfer_decay : kring->ckr_transfer_decay;
	channel_ring_user_stats_t stats = &kring->ckr_usr_stats;

	now = net_uptime();
	/* stamp last-sync time even when stats collection is disabled */
	kring->ckr_sync_time = now;

	if (kr_stat_enable == 0) {
		return;
	}

	stats->crsu_number_of_syncs++;
	stats->crsu_total_bytes_transferred += byte_count;
	stats->crsu_total_slots_transferred += slot_count;

	if (slot_count > stats->crsu_max_slots_transferred) {
		stats->crsu_max_slots_transferred = slot_count;
	}

	/* min is reset to 0 each window; 0 means "no sample yet" */
	if (stats->crsu_min_slots_transferred == 0 ||
	    slot_count < stats->crsu_min_slots_transferred) {
		stats->crsu_min_slots_transferred = slot_count;
	}

	if (__probable(kring->ckr_user_accumulate_start != 0)) {
		if ((now - kring->ckr_user_accumulate_start) >=
		    kr_accumulate_interval) {
			uint64_t        bps;
			uint64_t        sps;
			uint64_t        sps_ma;

			/*
			 * bytes per sync; accumulated_syncs is >= 1 here
			 * because every call after the window opens
			 * increments it at the bottom of this function.
			 */
			bps = kring->ckr_user_accumulated_bytes /
			    kring->ckr_user_accumulated_syncs;
			KR_EWMA(stats->crsu_bytes_per_sync_ma,
			    bps, transfer_decay);
			stats->crsu_bytes_per_sync = bps;

			/* slots per sync */
			sps = kring->ckr_user_accumulated_slots /
			    kring->ckr_user_accumulated_syncs;
			sps_ma = stats->crsu_slots_per_sync_ma;
			KR_EWMA(sps_ma, sps, transfer_decay);
			stats->crsu_slots_per_sync_ma = (uint32_t)sps_ma;
			stats->crsu_slots_per_sync = (uint32_t)sps;

			/* start over */
			kring->ckr_user_accumulate_start = now;
			kring->ckr_user_accumulated_bytes = 0;
			kring->ckr_user_accumulated_slots = 0;
			kring->ckr_user_accumulated_syncs = 0;

			stats->crsu_min_slots_transferred = 0;
			stats->crsu_max_slots_transferred = 0;
		}
	} else {
		/* first sync after a reset: open the accumulation window */
		kring->ckr_user_accumulate_start = now;
	}

	kring->ckr_user_accumulated_bytes += byte_count;
	kring->ckr_user_accumulated_slots += slot_count;
	kring->ckr_user_accumulated_syncs++;
}
289 
/*
 * Update kernel-side ring statistics: transfer/byte/slot totals, min/max
 * slots per transfer, and moving averages of bytes and slots per second
 * accumulated over kr_accumulate_interval seconds.
 * Caller to make sure thread safety.
 */
void
kr_update_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
    uint32_t byte_count)
{
	uint64_t now;
	uint64_t diff_secs;
	channel_ring_stats_t stats = &kring->ckr_stats;
	/* the global sysctl override, if set, wins over the per-ring decay */
	uint32_t transfer_decay = (kr_transfer_decay != 0) ?
	    kr_transfer_decay : kring->ckr_transfer_decay;

	if (kr_stat_enable == 0) {
		return;
	}

	/* empty transfers contribute nothing; skip early */
	if (__improbable(slot_count == 0)) {
		return;
	}

	stats->crs_number_of_transfers++;
	stats->crs_total_bytes_transferred += byte_count;
	stats->crs_total_slots_transferred += slot_count;
	if (slot_count > stats->crs_max_slots_transferred) {
		stats->crs_max_slots_transferred = slot_count;
	}
	/* min is reset to 0 each window; 0 means "no sample yet" */
	if (stats->crs_min_slots_transferred == 0 ||
	    slot_count < stats->crs_min_slots_transferred) {
		stats->crs_min_slots_transferred = slot_count;
	}

	now = net_uptime();
	if (__probable(kring->ckr_accumulate_start != 0)) {
		diff_secs = now - kring->ckr_accumulate_start;
		if (diff_secs >= kr_accumulate_interval) {
			uint64_t        bps;
			uint64_t        sps;
			uint64_t        sps_ma;

			/*
			 * bytes per second; diff_secs >= kr_accumulate_interval.
			 * NOTE(review): if the ring_stat_accumulate_interval
			 * sysctl were tuned to 0, diff_secs could be 0 here
			 * (divide by zero) — confirm tunable range.
			 */
			bps = kring->ckr_accumulated_bytes / diff_secs;
			KR_EWMA(stats->crs_bytes_per_second_ma,
			    bps, transfer_decay);
			stats->crs_bytes_per_second = bps;

			/* slots per second */
			sps = kring->ckr_accumulated_slots / diff_secs;
			sps_ma = stats->crs_slots_per_second_ma;
			KR_EWMA(sps_ma, sps, transfer_decay);
			stats->crs_slots_per_second_ma = (uint32_t)sps_ma;
			stats->crs_slots_per_second = (uint32_t)sps;

			/* start over */
			kring->ckr_accumulate_start = now;
			kring->ckr_accumulated_bytes = 0;
			kring->ckr_accumulated_slots = 0;

			stats->crs_min_slots_transferred = 0;
			stats->crs_max_slots_transferred = 0;
		}
	} else {
		/* first transfer after a reset: open the accumulation window */
		kring->ckr_accumulate_start = now;
	}
	kring->ckr_accumulated_bytes += byte_count;
	kring->ckr_accumulated_slots += slot_count;
}
355 
356 /* True if no space in the tx ring. only valid after kr_txsync_prologue */
357 boolean_t
kr_txempty(struct __kern_channel_ring * kring)358 kr_txempty(struct __kern_channel_ring *kring)
359 {
360 	return kring->ckr_rhead == kring->ckr_ktail;
361 }
362 
363 #if SK_LOG
364 /*
365  * Error logging routine called when txsync/rxsync detects an error.
366  * Expected to be called before killing the process with skywalk_kill_process()
367  *
368  * This routine is only called by the upper half of the kernel.
369  * It only reads khead (which is changed only by the upper half, too)
370  * and ktail (which may be changed by the lower half, but only on
371  * a tx ring and only to increase it, so any error will be recovered
372  * on the next call). For the above, we don't strictly need to call
373  * it under lock.
374  */
375 void
kr_log_bad_ring(struct __kern_channel_ring * kring)376 kr_log_bad_ring(struct __kern_channel_ring *kring)
377 {
378 	struct __user_channel_ring *ring = kring->ckr_ring;
379 	const slot_idx_t lim = kring->ckr_lim;
380 	slot_idx_t i;
381 	int errors = 0;
382 
383 	// XXX KASSERT nm_kr_tryget
384 	SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b", kring->ckr_name,
385 	    SK_KVA(kring), kring->ckr_flags, CKRF_BITS);
386 	// XXX probably wrong to trust userspace
387 
388 	if (ring->ring_head > lim) {
389 		errors++;
390 	}
391 	if (ring->ring_tail > lim) {
392 		errors++;
393 	}
394 	for (i = 0; i <= lim; i++) {
395 		struct __kern_slot_desc *ksd = KR_KSD(kring, i);
396 		struct __kern_quantum *kqum = ksd->sd_qum;
397 		obj_idx_t idx;
398 		uint32_t len;
399 
400 		if (!KSD_VALID_METADATA(ksd)) {
401 			continue;
402 		}
403 
404 		idx = METADATA_IDX(kqum);
405 		len = kqum->qum_len;
406 		if (len > kring->ckr_max_pkt_len) {
407 			SK_RDERR(5, "bad len at slot %u idx %u len %u",
408 			    i, idx, len);
409 		}
410 	}
411 
412 	if (errors != 0) {
413 		SK_ERR("total %d errors", errors);
414 		SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b crash, "
415 		    "head %u -> %u tail %u -> %u", kring->ckr_name,
416 		    SK_KVA(kring), kring->ckr_flags, CKRF_BITS, ring->ring_head,
417 		    kring->ckr_rhead, kring->ckr_khead,
418 		    ring->ring_tail, kring->ckr_ktail);
419 	}
420 }
421 #endif /* SK_LOG */
422 
/*
 * Reclaim slots released by userspace: advance ckr_khead to ckr_rhead
 * for RX/event rings and return the number of slots reclaimed (0 for
 * TX rings; see below).
 */
uint32_t
kr_reclaim(struct __kern_channel_ring *kr)
{
	int r = 0;

	VERIFY(sk_is_sync_protected());

	/*
	 * This is a no-op for TX ring, since the TX reclaim logic is only
	 * known to the nexus itself.  There, the nexus's TX sync code would
	 * figure out the number of slots that has been "transmitted", and
	 * advance the slot pointer accordingly.  This routine would then be
	 * called as a way to advise the system of such condition.
	 *
	 * For RX ring, this will reclaim user-released slots, and it is
	 * to be called by the provider's RX sync routine prior to its
	 * processing new slots (into the RX ring).
	 *
	 * It is therefore advised that this routine be called at the start
	 * of the RX sync callback, as well as at the end of the TX sync
	 * callback; the latter is useful in case we decide to implement
	 * more logic in future.
	 */
	if ((kr->ckr_tx == NR_RX) || (kr->ckr_tx == NR_EV)) {
		/* # of reclaimed slots */
		r = kr->ckr_rhead - kr->ckr_khead;
		/* ring indices wrap; fix up the modulo difference */
		if (r < 0) {
			r += kr->ckr_num_slots;
		}

		kr->ckr_khead = kr->ckr_rhead;
		/* ensure global visibility */
		membar_sync();
	}

	return (slot_idx_t)r;
}
460 
/*
 * Nexus-specific kr_txsync_prologue() callback.
 *
 * Walks the newly queued TX slots (from ckr_rhead up to, but not
 * including, 'head'), validates each quantum's user/kernel metadata
 * index agreement, and internalizes the user metadata.  *byte_count
 * accumulates the total payload length of the processed slots.
 * Returns 0 on success, or -1 with *err_reason set, in which case the
 * caller kills the process.
 */
int
kr_txprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
    const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
    struct proc *p)
{
	struct kern_pbufpool *pp = kring->ckr_pp;
	const uint32_t maxfrags = pp->pp_max_frags;
	slot_idx_t slot_idx = kring->ckr_rhead;

	/* this variant is only for channels without a user packet pool */
	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));

	while (slot_idx != head) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		struct __kern_quantum *kqum = ksd->sd_qum;
		int err;

		/*
		 * Unless the quantum is kernel-only, its metadata index
		 * must agree with its user-visible shadow; a mismatch
		 * means the shared state is corrupted.
		 */
		if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
		    METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
			SK_ERR("qum index mismatch");
			*err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
			return -1;
		}

		/* Internalize */
		err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
		if (__improbable(err != 0)) {
			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped "
			    "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
			    sk_proc_name_address(p), sk_proc_pid(p),
			    kring->ckr_name, SK_KVA(kring), slot_idx, err,
			    kring->ckr_khead, kring->ckr_ktail,
			    kring->ckr_rhead, kring->ckr_rtail,
			    kring->ckr_ring->ring_head,
			    kring->ckr_ring->ring_tail);
			*err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
			return -1;
		}

		*byte_count += kqum->qum_len;
		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

	return 0;
}
508 
/*
 * Nexus-specific kr_txsync_prologue() callback - user packet pool variant.
 *
 * Like kr_txprologue(), but each slot's packet must first be removed
 * from the user packet pool's allocated list (it was handed to
 * userspace earlier), then validated, internalized and attached to the
 * TX slot.  Returns 0 on success, or a non-zero errno with *err_reason
 * set; on some failure paths the offending packet is also freed.
 */
int
kr_txprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
    const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
    struct proc *p)
{
	struct kern_pbufpool *pp = kring->ckr_pp;
	const uint32_t maxfrags = pp->pp_max_frags;
	slot_idx_t slot_idx = kring->ckr_rhead;
	struct __kern_quantum *kqum = NULL;
	bool free_pkt = false;		/* free kqum on the error path? */
	int err = 0;

	ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);

	/* pool lock held across the whole walk; released at done: */
	PP_LOCK(pp);
	while (slot_idx != head) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		struct __user_slot_desc *usd = KR_USD(kring, slot_idx);

		/*
		 * The channel is operating in user packet pool mode;
		 * check if the packet is in the allocated list.
		 */
		kqum = pp_remove_upp_locked(pp, usd->sd_md_idx, &err);
		if (__improbable(err != 0)) {
			/*
			 * err with a non-NULL kqum means the packet was
			 * found but its buflet chain is corrupt.
			 */
			if (kqum != NULL) {
				SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u "
				    "kqum %p, bad buflet chain",
				    sk_proc_name_address(p), sk_proc_pid(p),
				    kring->ckr_name, SK_KVA(kring), slot_idx,
				    SK_KVA(kqum));
				*err_reason =
				    SKYWALK_KILL_REASON_BAD_BUFLET_CHAIN;
				goto done;
			}

			/* packet was never allocated to this channel */
			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u "
			    " unallocated packet %u kh %u kt %u | "
			    "rh %u rt %u | h %u t %u",
			    sk_proc_name_address(p), sk_proc_pid(p),
			    kring->ckr_name, SK_KVA(kring), slot_idx,
			    usd->sd_md_idx, kring->ckr_khead, kring->ckr_ktail,
			    kring->ckr_rhead, kring->ckr_rtail,
			    kring->ckr_ring->ring_head,
			    kring->ckr_ring->ring_tail);
			*err_reason = SKYWALK_KILL_REASON_UNALLOCATED_PKT;
			goto done;
		}

		/*
		 * Unless kernel-only, the kernel metadata index must agree
		 * with its user-visible shadow.
		 */
		if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
		    METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
			SK_ERR("qum index mismatch");
			*err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
			err = ERANGE;
			free_pkt = true;
			goto done;
		}

		/* Internalize */
		err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
		if (__improbable(err != 0)) {
			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped "
			    "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
			    sk_proc_name_address(p), sk_proc_pid(p),
			    kring->ckr_name, SK_KVA(kring), slot_idx, err,
			    kring->ckr_khead, kring->ckr_ktail,
			    kring->ckr_rhead, kring->ckr_rtail,
			    kring->ckr_ring->ring_head,
			    kring->ckr_ring->ring_tail);
			*err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
			free_pkt = true;
			goto done;
		}

		/*
		 * Attach packet to slot, detach mapping from alloc ring slot.
		 */
		kqum->qum_ksd = NULL;
		USD_RESET(usd);
		KR_SLOT_ATTACH_METADATA(kring, ksd, kqum);

		*byte_count += kqum->qum_len;
		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

done:
	PP_UNLOCK(pp);
	/* free the half-processed packet outside the pool lock */
	if (__improbable(err != 0) && free_pkt) {
		ASSERT(kqum != NULL);
		kqum->qum_ksd = NULL;
		pp_free_packet(pp, (uint64_t)kqum);
	}
	return err;
}
606 
/*
 * NM_FAIL_ON(t, reason): if condition t holds, log the failed check,
 * record the kill reason in the enclosing function's 'err_reason', and
 * jump to its local 'error' label.
 */
#define NM_FAIL_ON(t, reason) if (__improbable(t)) { SK_ERR("fail " #t); \
	err_reason = reason; goto error; }
/*
 * Validate parameters in the TX/FREE ring/kring.
 *
 * ckr_rhead, ckr_rtail=ktail are stored from previous round.
 * khead is the next packet to send to the ring.
 *
 * We want
 *    khead <= *ckr_rhead <= head <= tail = *ckr_rtail <= ktail
 *
 * ckr_khead, ckr_rhead, ckr_rtail and ckr_ktail are reliable
 */
#define _KR_TXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh, _krt) do {\
	slot_idx_t _n = (_kring)->ckr_num_slots;                        \
	/* kernel sanity checks */                                      \
	NM_FAIL_ON((_kh) >= _n || kring->ckr_rhead >= _n || (_krt) >= _n || \
	    (_kt) >= _n, SKYWALK_KILL_REASON_BASIC_SANITY);             \
	/* user basic sanity checks */                                  \
	NM_FAIL_ON((_rh) >= _n, SKYWALK_KILL_REASON_BASIC_SANITY);      \
	/* \
	 * user sanity checks. We only use 'cur', \
	 * A, B, ... are possible positions for cur: \
	 * \
	 *  0    A  cur   B  tail  C  n-1 \
	 *  0    D  tail  E  cur   F  n-1 \
	 * \
	 * B, F, D are valid. A, C, E are wrong \
	 */                                                             \
	if ((_krt) >= kring->ckr_rhead) {                               \
	/* want ckr_rhead <= head <= ckr_rtail */               \
	        NM_FAIL_ON((_rh) < kring->ckr_rhead || (_rh) > (_krt),  \
	            SKYWALK_KILL_REASON_HEAD_OOB);                      \
	} else { /* here ckr_rtail < ckr_rhead */                       \
	/* we need head outside ckr_rtail .. ckr_rhead */       \
	        NM_FAIL_ON((_rh) > (_krt) && (_rh) < kring->ckr_rhead,  \
	            SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED);              \
	}                                                               \
	NM_FAIL_ON(ring->ring_tail != (_krt),                           \
	    SKYWALK_KILL_REASON_TAIL_MISMATCH);                         \
} while (0)
648 
/*
 * Validate parameters in the ring/kring on entry for *_txsync().
 * Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
 * in case of error, in order to force a reinit.
 */
slot_idx_t
kr_txsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
    struct proc *p)
{
	struct __user_channel_ring *ring = kring->ckr_ring;
	slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
	slot_idx_t head;
	uint32_t byte_count = 0;
	uint64_t err_reason = 0;
	int slot_count;

	VERIFY(sk_is_sync_protected());
	/* assert that this routine is only called for user facing rings */
	ASSERT(!KR_KERNEL_ONLY(kring));
	ASSERT(kring->ckr_usds != NULL);

	/* read these once and use local copies */
	head = ring->ring_head;
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;
	/* full barrier between the ktail and rtail reads */
	membar_sync();
	ckr_rtail = kring->ckr_rtail;

	SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, ckr_rtail,
	    ring->ring_head, ring->ring_tail);

	/* jumps to error: below on any validation failure */
	_KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);

	/* # of new tx slots */
	slot_count = head - kring->ckr_rhead;
	if (slot_count < 0) {
		slot_count += kring->ckr_num_slots;
	}

	/*
	 * Invoke nexus-specific TX prologue callback, set in na_kr_create().
	 */
	if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
	    kring, head, &byte_count, &err_reason, p) != 0)) {
		goto error;
	}

	/* update the user's view of slots & bytes transferred */
	kr_update_user_stats(kring, slot_count, byte_count);

	/* update the kernel view of ring */
	kring->ckr_rhead = head;

	/* save for kr_txsync_finalize(); only khead is needed */
	kring->ckr_khead_pre = ckr_khead;

	return head;

error:
	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
	    "rh %u rt %u | h %u t %u |", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
	    CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead,
	    ckr_rtail, head, ring->ring_tail);

	/* the process corrupted shared ring state; kill it */
	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_TX_SYNC);

	return kring->ckr_num_slots;
}
721 
/*
 * Validate parameters in the ring/kring on entry for *_free_sync().
 * Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
 * in case of error, in order to force a reinit.
 */
slot_idx_t
kr_free_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
{
	struct __user_channel_ring *ring = kring->ckr_ring;
	slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
	slot_idx_t head;
	uint64_t err_reason = 0;

	VERIFY(sk_is_sync_protected());
	/* read these once and use local copies */
	head = ring->ring_head;
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;
	/* full barrier between the ktail and rtail reads */
	membar_sync();
	ckr_rtail = kring->ckr_rtail;

	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, ckr_rtail, ring->ring_head, ring->ring_tail);

	/* jumps to error: below on any validation failure */
	_KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);

	/* update the kernel view of ring */
	kring->ckr_rhead = head;
	return head;

error:
	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
	    "rh %u rt %u | h %u t %u |", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
	    CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead,
	    ckr_rtail, head, ring->ring_tail);

	/* the process corrupted shared ring state; kill it */
	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_FREE_SYNC);
	return kring->ckr_num_slots;
}
764 
/*
 * Nexus-specific kr_rxsync_prologue() callback.
 *
 * Reclaims the RX slots just consumed by userspace (ckr_rhead up to,
 * but not including, 'head'): detaches each packet, batch-frees them
 * back to the pool, accumulates the consumed bytes into *byte_count,
 * and debits ckr_ready_bytes.  Returns 0, or -1 with *err_reason set
 * if the ready-bytes accounting is inconsistent.
 */
int
kr_rxprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
    const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
    struct proc *p)
{
#pragma unused(ch, p)
	slot_idx_t slot_idx = kring->ckr_rhead;
	uint32_t nfree = 0;

	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));

	/*
	 * Iterating through the slots just read by user-space;
	 * ckr_rhead -> ring_head
	 */
	while (slot_idx != head) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		struct __kern_quantum *kqum = ksd->sd_qum;

		ASSERT(KSD_VALID_METADATA(ksd));
		/* # of new bytes transferred */
		*byte_count += kqum->qum_len;

		/* detach and free the packet */
		(void) KR_SLOT_DETACH_METADATA(kring, ksd);
		ASSERT(nfree < kring->ckr_num_slots);
		/* stage in the scratch area for one batched free below */
		kring->ckr_scratch[nfree++] = (uint64_t)kqum;

		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

	if (nfree > 0) {
		pp_free_packet_batch(kring->ckr_pp,
		    &kring->ckr_scratch[0], nfree);
	}

	/*
	 * Update userspace channel statistics of # readable bytes
	 * subtract byte counts from slots just given back to the kernel.
	 */
	if (kring->ckr_ready_bytes < *byte_count) {
		SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
		    "(%u < %u)  kh %u kt %u | rh %u rt %u | h %u t %u",
		    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
		    SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
		    kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
		    kring->ckr_rtail, kring->ckr_ring->ring_head,
		    kring->ckr_ring->ring_tail);
		*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
		return -1;
	}
	kring->ckr_ready_bytes -= *byte_count;

	return 0;
}
823 
/*
 * Nexus-specific kr_rxsync_prologue() callback - no detach variant.
 *
 * Like kr_rxprologue(), but the packets remain attached to their
 * slots: only the consumed byte count is accumulated and debited from
 * ckr_ready_bytes.  Returns 0, or -1 with *err_reason set if the
 * ready-bytes accounting is inconsistent.
 */
int
kr_rxprologue_nodetach(struct kern_channel *ch,
    struct __kern_channel_ring *kring, const slot_idx_t head,
    uint32_t *byte_count, uint64_t *err_reason, struct proc *p)
{
#pragma unused(ch, p)
	slot_idx_t slot_idx = kring->ckr_rhead;

	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));

	/*
	 * Iterating through the slots just read by user-space;
	 * ckr_rhead -> ring_head
	 */
	while (slot_idx != head) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		struct __kern_quantum *kqum = ksd->sd_qum;

		ASSERT(KSD_VALID_METADATA(ksd));
		/* # of new bytes transferred */
		*byte_count += kqum->qum_len;
		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

	/*
	 * Update userspace channel statistics of # readable bytes
	 * subtract byte counts from slots just given back to the kernel.
	 */
	if (kring->ckr_ready_bytes < *byte_count) {
		SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
		    "(%u < %u)  kh %u kt %u | rh %u rt %u | h %u t %u",
		    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
		    SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
		    kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
		    kring->ckr_rtail, kring->ckr_ring->ring_head,
		    kring->ckr_ring->ring_tail);
		*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
		return -1;
	}
	kring->ckr_ready_bytes -= *byte_count;

	return 0;
}
870 
/*
 * Nexus-specific kr_rxsync_prologue() callback - user packet pool variant.
 *
 * Accumulates into *byte_count the lengths recorded in the user slot
 * descriptors just returned by user-space (ckr_rhead -> head), and
 * subtracts that total from ckr_ready_bytes.  Returns 0 on success;
 * returns -1 and sets *err_reason if a slot still has a packet
 * attached or the ready-byte accounting is inconsistent (the caller
 * then kills the offending process).
 */
int
kr_rxprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
    const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
    struct proc *p)
{
#pragma unused(ch, p)
	slot_idx_t slot_idx = kring->ckr_rhead;

	/* this variant is only used when the user packet pool is enabled */
	ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);

	/*
	 * Iterating through the slots just read by user-space;
	 * ckr_rhead -> ring_head
	 */
	while (slot_idx != head) {
		struct __user_slot_desc *usd = KR_USD(kring, slot_idx);

		/*
		 * This is a user facing ring opting in for the user packet
		 * pool mode, so ensure that the user has detached packet
		 * from slot.
		 */
		ASSERT(!KSD_VALID_METADATA(KR_KSD(kring, slot_idx)));
		if (SD_VALID_METADATA(usd)) {
			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not "
			    "detached md %u kh %u kt %u | rh %u rt %u |"
			    " h %u t %u", sk_proc_name_address(p),
			    sk_proc_pid(p), kring->ckr_name,
			    SK_KVA(kring), slot_idx, usd->sd_md_idx,
			    kring->ckr_khead, kring->ckr_ktail,
			    kring->ckr_rhead, kring->ckr_rtail,
			    kring->ckr_ring->ring_head,
			    kring->ckr_ring->ring_tail);
			*err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
			return -1;
		}
		/*
		 * The packet itself is gone (detached by the user), so use
		 * the length stashed in the user slot descriptor instead.
		 */
		*byte_count += usd->sd_len;

		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

	/*
	 * update userspace channel statistics of # readable bytes
	 * subtract byte counts from slots just given back to the kernel
	 */
	if (kring->ckr_ready_bytes < *byte_count) {
		SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
		    "(%u < %u)  kh %u kt %u | rh %u rt %u | h %u t %u",
		    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
		    SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
		    kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
		    kring->ckr_rtail, kring->ckr_ring->ring_head,
		    kring->ckr_ring->ring_tail);
		*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
		return -1;
	}
	kring->ckr_ready_bytes -= *byte_count;

	return 0;
}
934 
/*
 * Validate parameters in the RX/ALLOC/EVENT ring/kring.
 * For a valid configuration,
 * khead <= head <= tail <= ktail
 *
 * We only consider head.
 * khead and ktail are reliable: they are written by the kernel only,
 * whereas head is read from the user-mapped ring and must therefore
 * be range-checked before use.
 *
 * NOTE(review): NM_FAIL_ON (defined earlier in this file, #undef'd
 * after the last use below) appears to bail out to the enclosing
 * function's "error" label with the given kill reason -- the "error"
 * labels in kr_alloc_sync_prologue() and kr_event_sync_prologue()
 * are only reachable through this macro; confirm against its
 * definition.
 */
#define _KR_RXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh)       do {    \
	slot_idx_t _n = (_kring)->ckr_num_slots;                        \
	/* kernel sanity checks: khead/ktail must be valid indices */   \
	NM_FAIL_ON((_kh) >= _n || (_kt) >= _n,                          \
	    SKYWALK_KILL_REASON_BASIC_SANITY);                          \
	/* user sanity checks */                                        \
	if ((_kt) >= (_kh)) {                                           \
	        /* want khead <= head <= ktail */                       \
	        NM_FAIL_ON((_rh) < (_kh) || (_rh) > (_kt),              \
	            SKYWALK_KILL_REASON_HEAD_OOB);                      \
	} else {                                                        \
	        /* we need head outside ktail..khead */                 \
	        NM_FAIL_ON((_rh) < (_kh) && (_rh) > (_kt),              \
	            SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED);              \
	}                                                               \
	/* the user must not have moved the tail since the last sync */ \
	NM_FAIL_ON((_ring)->ring_tail != (_kring)->ckr_rtail,           \
	    SKYWALK_KILL_REASON_TAIL_MISMATCH);                         \
} while (0)
961 
/*
 * Validate parameters in the ring/kring on entry for *_rxsync().
 * Returns ring->ring_head if ok, kring->ckr_num_slots on error,
 * in order to force a reinit.  On error the offending process is
 * killed with a reason code composed from the specific failure.
 */
slot_idx_t
kr_rxsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
    struct proc *p)
{
#pragma unused(ch)
	struct __user_channel_ring *ring = kring->ckr_ring;
	slot_idx_t ckr_khead, ckr_ktail;
	slot_idx_t head;
	uint32_t byte_count = 0;	/* filled in by ckr_prologue */
	uint64_t err_reason = 0;
	int slot_count;

	VERIFY(sk_is_sync_protected());
	/* assert that this routine is only called for user facing rings */
	ASSERT(!KR_KERNEL_ONLY(kring));
	ASSERT(kring->ckr_usds != NULL);

	/* read these once and use local copies */
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;

	SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    ring->ring_head, ring->ring_tail);
	/*
	 * Before storing the new values, we should check they do not
	 * move backwards. However:
	 * - head is not an issue because the previous value is khead;
	 * - cur could in principle go back, however it does not matter
	 *   because we are processing a brand new rxsync()
	 */
	head = ring->ring_head; /* read only once */

	/* range-check head; failure branches to the "error" label below */
	_KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);

	/* # of reclaimed slots (with wraparound correction) */
	slot_count = head - kring->ckr_rhead;
	if (slot_count < 0) {
		slot_count += kring->ckr_num_slots;
	}

	/*
	 * Invoke nexus-specific RX prologue callback, which may detach
	 * and free any consumed packets.  Configured in na_kr_create().
	 */
	if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
	    kring, head, &byte_count, &err_reason, p) != 0)) {
		goto error;
	}
	/* update the user's view of slots & bytes transferred */
	kr_update_user_stats(kring, slot_count, byte_count);

	/* update the kernel view of ring */
	kring->ckr_rhead = head;
	return head;

error:
	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
	    CKRF_BITS, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    ring->ring_head, ring->ring_tail);

	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_RX_SYNC);
	return kring->ckr_num_slots;
}
1036 
/*
 * Validate parameters on the ring/kring on entry for *_alloc_sync().
 * Returns ring->ring_head if ok, kring->ckr_num_slots on error,
 * in order to force a reinit.  On error the offending process is
 * killed with a reason code composed from the specific failure.
 */
slot_idx_t
kr_alloc_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
{
	struct __user_channel_ring *ring = kring->ckr_ring;
	slot_idx_t ckr_khead, ckr_ktail;
	slot_idx_t head;
	uint64_t err_reason = 0;

	VERIFY(sk_is_sync_protected());

	/* read these once and use local copies */
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;
	head = ring->ring_head;

	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    head, ring->ring_tail);
	/*
	 * Before storing the new values, we should check they do not
	 * move backwards. However, head is not an issue because the
	 * previous value is khead;
	 *
	 * NOTE(review): the "error" label below is only reachable via
	 * NM_FAIL_ON inside this macro, which also provides err_reason.
	 */
	_KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);

	/* update the kernel view of ring */
	kring->ckr_rhead = head;
	return head;

error:
	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
	    CKRF_BITS, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    ring->ring_head, ring->ring_tail);

	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_ALLOC_SYNC);
	return kring->ckr_num_slots;
}
1084 
/*
 * Nexus-specific kr_txsync_finalize() callback.
 *
 * Replenishes the TX slots consumed by this sync (ckr_khead_pre up to
 * head) with freshly allocated packets from the ring's packet pool and
 * externalizes them for user-space.  Slots that still hold metadata
 * are skipped; any over-allocated packet handles are returned to the
 * pool at the end.
 */
void
kr_txfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
    const slot_idx_t head, struct proc *p)
{
#pragma unused(ch)
	struct kern_pbufpool *pp = kring->ckr_pp;
	slot_idx_t slot_idx;
	uint32_t ph_cnt, i = 0;	/* i indexes the next unused scratch handle */
	int32_t ph_needed;
	int err;

	/* this variant is never used with the user packet pool */
	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));

	/* use khead value from pre-sync time */
	slot_idx = kring->ckr_khead_pre;

	/* # of slots transferred by this sync (with wraparound) */
	ph_needed = head - slot_idx;
	if (ph_needed < 0) {
		ph_needed += kring->ckr_num_slots;
	}
	if (ph_needed == 0) {
		/* nothing was transferred; nothing to replenish */
		return;
	}

	/* batch-allocate one packet handle per transferred slot */
	ph_cnt = (uint32_t)ph_needed;
	err = kern_pbufpool_alloc_batch(pp, 1, kring->ckr_scratch, &ph_cnt);
	VERIFY(err == 0 && ph_cnt == (uint32_t)ph_needed);

	/* recycle the transferred packets */
	while (slot_idx != head) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		kern_packet_t ph;

		/* slot still occupied; leave it alone */
		if (KSD_VALID_METADATA(ksd)) {
			goto next_slot;
		}

		ph = kring->ckr_scratch[i];
		ASSERT(ph != 0);
		kring->ckr_scratch[i] = 0;
		++i;

		/*
		 * Since this packet is freshly allocated and we need
		 * to have the flag set for the attach to succeed,
		 * just set it here rather than calling
		 * __packet_finalize().
		 */
		SK_PTR_ADDR_KQUM(ph)->qum_qflags |= QUM_F_FINALIZED;

		KR_SLOT_ATTACH_METADATA(kring, ksd, SK_PTR_ADDR_KQUM(ph));

		/* make the new packet visible/usable to user-space */
		kr_externalize_metadata_internal(kring, pp->pp_max_frags,
		    SK_PTR_ADDR_KQUM(ph), p);
next_slot:
		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

	/* free the handles we did not consume (occupied slots) */
	if (i != ph_cnt) {
		kern_pbufpool_free_batch(pp, &kring->ckr_scratch[i],
		    ph_cnt - i);
	}
}
1151 
1152 /*
1153  * Nexus-specific kr_txsync_finalize() callback - user packet pool variant.
1154  */
1155 void
kr_txfinalize_upp(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t head,struct proc * p)1156 kr_txfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
1157     const slot_idx_t head, struct proc *p)
1158 {
1159 #pragma unused(ch, p)
1160 	slot_idx_t slot_idx;
1161 	uint32_t nfree = 0;
1162 
1163 	ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
1164 
1165 	/* use khead value from pre-sync time */
1166 	slot_idx = kring->ckr_khead_pre;
1167 
1168 	/* recycle the transferred packets */
1169 	while (slot_idx != head) {
1170 		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1171 
1172 		if (KSD_VALID_METADATA(ksd)) {
1173 			/* detach and free the packet */
1174 			struct __kern_quantum *kqum = ksd->sd_qum;
1175 			(void) KR_SLOT_DETACH_METADATA(kring, ksd);
1176 			ASSERT(nfree < kring->ckr_num_slots);
1177 			kring->ckr_scratch[nfree++] = (uint64_t)kqum;
1178 		}
1179 
1180 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1181 	}
1182 
1183 	if (__probable(nfree > 0)) {
1184 		pp_free_packet_batch(kring->ckr_pp,
1185 		    &kring->ckr_scratch[0], nfree);
1186 	}
1187 }
1188 
/*
 * Update kring and ring at the end of txsync.
 *
 * Recomputes the user-visible free-space statistics, invokes the
 * nexus-specific TX finalize callback (if any) to recycle/replenish
 * slots, and publishes the kernel's khead/ktail to the user-mapped
 * ring.
 */
void
kr_txsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
    struct proc *p)
{
	slot_idx_t ckr_khead, ckr_ktail;
	uint32_t slot_size;
	int32_t slot_diff;

	VERIFY(sk_is_sync_protected());
	/* assert that this routine is only called for user facing rings */
	ASSERT(!KR_KERNEL_ONLY(kring));

	/* read these once and use local copies */
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;

	/*
	 * update userspace-facing channel statistics (# writable bytes/slots)
	 *
	 * Since the ring might be dynamically allocated, we can't rely on the
	 * tail pointer to calculate free TX space (the tail might be sitting
	 * at the edge of allocated ring space but be able to be pushed over
	 * into unallocated ring space).
	 *
	 * Instead, calculate free TX space by looking at what slots are
	 * available to the kernel for TX, and subtracting that from the total
	 * number of possible slots. This is effectively what userspace can
	 * write to.
	 */
	slot_size = kring->ckr_pp->pp_buflet_size;
	/* slots currently held by the kernel: rhead -> khead (wrapped) */
	slot_diff = kring->ckr_rhead - ckr_khead;
	if (slot_diff < 0) {
		slot_diff += kring->ckr_num_slots;
	}
	slot_diff = kring->ckr_lim - slot_diff;
	kring->ckr_ready_slots = slot_diff;
	kring->ckr_ready_bytes = slot_diff * slot_size;

	/*
	 * Invoke nexus-specific TX finalize callback, which may recycle any
	 * transferred packets and/or externalize new ones.  Some nexus don't
	 * have any callback set.  Configured in na_kr_create().
	 */
	if (kring->ckr_finalize != NULL) {
		kring->ckr_finalize(ch, kring, ckr_khead, p);
	}

	/*
	 * Update ring tail/khead to what the kernel knows.  The casts
	 * allow writing through fields of the user-mapped ring that are
	 * otherwise not directly assignable from here.
	 */
	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
	    kring->ckr_rtail = ckr_ktail;
	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;

	SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    kring->ckr_ring->ring_head,
	    kring->ckr_ring->ring_tail);
}
1251 
1252 /*
1253  * Nexus-specific kr_rxsync_finalize() callback.
1254  */
1255 void
kr_rxfinalize(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t tail,struct proc * p)1256 kr_rxfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1257     const slot_idx_t tail, struct proc *p)
1258 {
1259 #pragma unused(ch)
1260 	const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
1261 	slot_idx_t slot_idx = kring->ckr_rtail;
1262 	uint32_t byte_count = 0;
1263 
1264 	while (slot_idx != tail) {
1265 		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1266 		struct __kern_quantum *kqum = ksd->sd_qum;
1267 
1268 		/*
1269 		 * nexus provider should never leave an empty slot on rx ring.
1270 		 */
1271 		VERIFY(kqum != NULL);
1272 		kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1273 		ASSERT(!(KR_USD(kring, slot_idx)->sd_flags & ~SD_FLAGS_USER));
1274 
1275 		byte_count += kqum->qum_len;
1276 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1277 	}
1278 
1279 	kring->ckr_ready_bytes += byte_count;
1280 
1281 	/* just recalculate slot count using pointer arithmetic */
1282 	int32_t slot_diff = tail - kring->ckr_rhead;
1283 	if (slot_diff < 0) {
1284 		slot_diff += kring->ckr_num_slots;
1285 	}
1286 	kring->ckr_ready_slots = slot_diff;
1287 
1288 #if CONFIG_NEXUS_NETIF
1289 	/*
1290 	 * If this is a channel opened directly to the netif nexus, provide
1291 	 * it feedbacks on the number of packets and bytes consumed.  This
1292 	 * will drive the receive mitigation strategy.
1293 	 */
1294 	if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
1295 	    slot_diff != 0 && byte_count != 0) {
1296 		kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
1297 	}
1298 #endif /* CONFIG_NEXUS_NETIF */
1299 }
1300 
/*
 * Nexus-specific kr_rxsync_finalize() callback - user packet pool variant.
 *
 * For each newly arrived slot (ckr_rtail -> tail): inserts the packet
 * into the channel's user packet pool allocated list, detaches it from
 * the kernel slot descriptor, records its length in the user slot
 * descriptor, and externalizes the metadata for user-space.  Finally
 * refreshes the readable byte/slot counters.  The pool lock is held
 * across the whole loop.
 */
void
kr_rxfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
    const slot_idx_t tail, struct proc *p)
{
	const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
	slot_idx_t slot_idx = kring->ckr_rtail;
	struct kern_pbufpool *pp = kring->ckr_pp;
	uint32_t byte_count = 0;

	PP_LOCK(pp);
	while (slot_idx != tail) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
		struct __kern_quantum *kqum = ksd->sd_qum;

		/*
		 * nexus provider should never leave an empty slot on rx ring.
		 */
		VERIFY(kqum != NULL);
		/*
		 * The channel is operating in packet allocator
		 * mode, so add packet to the allocated list.
		 */
		pp_insert_upp_locked(pp, kqum, ch->ch_pid);

		KSD_DETACH_METADATA(ksd);
		/* To calculate ckr_ready_bytes by kr_rxsync_prologue */
		USD_SET_LENGTH(usd, (uint16_t)kqum->qum_len);

		kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
		/* only user-visible flags may remain set afterwards */
		ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);

		byte_count += kqum->qum_len;
		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}
	PP_UNLOCK(pp);

	kring->ckr_ready_bytes += byte_count;

	/* just recalculate slot count using pointer arithmetic */
	int32_t slot_diff = tail - kring->ckr_rhead;
	if (slot_diff < 0) {
		slot_diff += kring->ckr_num_slots;
	}
	kring->ckr_ready_slots = slot_diff;

#if CONFIG_NEXUS_NETIF
	/*
	 * If this is a channel opened directly to the netif nexus, provide
	 * it feedbacks on the number of packets and bytes consumed.  This
	 * will drive the receive mitigation strategy.
	 */
	if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
	    slot_diff != 0 && byte_count != 0) {
		kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
	}
#endif /* CONFIG_NEXUS_NETIF */
}
1362 
1363 /*
1364  * Update kring and ring at the end of rxsync
1365  */
1366 void
kr_rxsync_finalize(struct kern_channel * ch,struct __kern_channel_ring * kring,struct proc * p)1367 kr_rxsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1368     struct proc *p)
1369 {
1370 #pragma unused(ch, p)
1371 	slot_idx_t ckr_khead, ckr_ktail;
1372 
1373 	VERIFY(sk_is_sync_protected());
1374 	/* assert that this routine is only called for user facing rings */
1375 	ASSERT(!KR_KERNEL_ONLY(kring));
1376 	ASSERT(kring->ckr_usds != NULL);
1377 
1378 	/* read these once and use local copies */
1379 	ckr_khead = kring->ckr_khead;
1380 	ckr_ktail = kring->ckr_ktail;
1381 
1382 	/*
1383 	 * Invoke nexus-specific RX finalize callback; set in na_kr_create().
1384 	 */
1385 	if (kring->ckr_finalize != NULL) {
1386 		kring->ckr_finalize(ch, kring, ckr_ktail, p);
1387 	}
1388 
1389 	/* update ring tail/khead to what the kernel knows */
1390 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1391 	    kring->ckr_rtail = ckr_ktail;
1392 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1393 
1394 	SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1395 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1396 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1397 	    kring->ckr_rhead, kring->ckr_rtail,
1398 	    kring->ckr_ring->ring_head,
1399 	    kring->ckr_ring->ring_tail);
1400 }
1401 
/*
 * Finalize an alloc ring sync: publish the kernel's khead/ktail and
 * the current allocation working set (ckr_alloc_ws) to the user-mapped
 * ring.
 */
void
kr_alloc_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
{
/* p is only referenced by SK_DF, which may compile to nothing */
#pragma unused(p)
	slot_idx_t ckr_khead, ckr_ktail;

	VERIFY(sk_is_sync_protected());
	/* read these once and use local copies */
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;

	/*
	 * Update ring tail/khead to what the kernel knows.  The casts
	 * allow writing through fields of the user-mapped ring that are
	 * otherwise not directly assignable from here.
	 */
	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
	    kring->ckr_rtail = ckr_ktail;
	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
	*(uint32_t *)(uintptr_t)&kring->ckr_ring->ring_alloc_ws =
	    kring->ckr_alloc_ws;

	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u | ws %u",
	    sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    kring->ckr_ring->ring_head,
	    kring->ckr_ring->ring_tail, kring->ckr_alloc_ws);
}
1428 
/*
 * Finalize a free ring sync: publish the kernel's khead/ktail to the
 * user-mapped ring.
 */
void
kr_free_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
{
/* p is only referenced by SK_DF, which may compile to nothing */
#pragma unused(p)
	slot_idx_t ckr_khead, ckr_ktail;

	VERIFY(sk_is_sync_protected());
	/* read these once and use local copies */
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;

	/*
	 * Update ring tail/khead to what the kernel knows.  The casts
	 * allow writing through fields of the user-mapped ring that are
	 * otherwise not directly assignable from here.
	 */
	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
	    kring->ckr_rtail = ckr_ktail;
	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;

	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    kring->ckr_ring->ring_head,
	    kring->ckr_ring->ring_tail);
}
1452 
/*
 * Validate parameters in the ring/kring on entry for an event ring
 * sync.  Also verifies that user-space has detached the packet from
 * every slot it has consumed.  Returns ring->ring_head if ok,
 * kring->ckr_num_slots on error (after killing the offending process),
 * in order to force a reinit.
 */
slot_idx_t
kr_event_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
{
	struct __user_channel_ring *ring = kring->ckr_ring;
	slot_idx_t ckr_khead, ckr_ktail;
	slot_idx_t head, slot_idx;
	uint64_t err_reason = 0;

	ASSERT(kring->ckr_tx == NR_EV);
	VERIFY(sk_is_sync_protected());

	/* read these once and use local copies */
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;
	head = ring->ring_head;

	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    head, ring->ring_tail);
	/*
	 * Before storing the new values, we should check they do not
	 * move backwards. However, head is not an issue because the
	 * previous value is khead;
	 */
	_KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);

	/*
	 * Iterating through the slots just read by user-space;
	 * ckr_rhead -> ring_head
	 */
	slot_idx = kring->ckr_rhead;
	while (slot_idx != head) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
		/*
		 * ensure that the user has detached packet from slot.
		 */
		VERIFY(!KSD_VALID_METADATA(ksd));
		if (__improbable(SD_VALID_METADATA(usd))) {
			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not "
			    "detached md %u kh %u kt %u | rh %u rt %u |"
			    " h %u t %u", sk_proc_name_address(p),
			    sk_proc_pid(p), kring->ckr_name,
			    SK_KVA(kring), slot_idx, usd->sd_md_idx,
			    ckr_khead, ckr_ktail, kring->ckr_rhead,
			    kring->ckr_rtail, ring->ring_head,
			    ring->ring_tail);
			err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
			goto error;
		}
		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

	/* update the kernel view of ring */
	kring->ckr_rhead = head;
	return head;

error:
	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
	    CKRF_BITS, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    ring->ring_head, ring->ring_tail);

	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_EVENT_SYNC);
	return kring->ckr_num_slots;
}
1523 
1524 void
kr_event_sync_finalize(struct kern_channel * ch,struct __kern_channel_ring * kring,struct proc * p)1525 kr_event_sync_finalize(struct kern_channel *ch,
1526     struct __kern_channel_ring *kring, struct proc *p)
1527 {
1528 #pragma unused(ch)
1529 	struct kern_pbufpool *pp = kring->ckr_pp;
1530 	const uint32_t maxfrags = pp->pp_max_frags;
1531 	slot_idx_t ckr_khead, ckr_ktail, ckr_rhead;
1532 	struct __kern_slot_desc *ksd;
1533 	struct __user_slot_desc *usd;
1534 	struct __kern_quantum *kqum;
1535 
1536 	VERIFY(sk_is_sync_protected());
1537 	/* assert that this routine is only called for user facing rings */
1538 	ASSERT(!KR_KERNEL_ONLY(kring));
1539 	ASSERT(kring->ckr_usds != NULL);
1540 	ASSERT(kring->ckr_tx == NR_EV);
1541 
1542 	/* read these once and use local copies */
1543 	ckr_khead = kring->ckr_khead;
1544 	ckr_ktail = kring->ckr_ktail;
1545 	ckr_rhead = kring->ckr_rhead;
1546 
1547 	slot_idx_t slot_idx = kring->ckr_rtail;
1548 	PP_LOCK(pp);
1549 	while (slot_idx != ckr_ktail) {
1550 		ksd = KR_KSD(kring, slot_idx);
1551 		usd = KR_USD(kring, slot_idx);
1552 		kqum = ksd->sd_qum;
1553 
1554 		/*
1555 		 * Add packet to the allocated list of user packet pool.
1556 		 */
1557 		pp_insert_upp_locked(pp, kqum, ch->ch_pid);
1558 
1559 		KSD_DETACH_METADATA(ksd);
1560 		kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1561 		ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);
1562 		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1563 	}
1564 	PP_UNLOCK(pp);
1565 
1566 	/* just recalculate slot count using pointer arithmetic */
1567 	int32_t slot_diff = ckr_ktail - ckr_rhead;
1568 	if (slot_diff < 0) {
1569 		slot_diff += kring->ckr_num_slots;
1570 	}
1571 	kring->ckr_ready_slots = slot_diff;
1572 
1573 	/* update ring tail/khead to what the kernel knows */
1574 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1575 	    kring->ckr_rtail = ckr_ktail;
1576 	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1577 
1578 	SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1579 	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1580 	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1581 	    kring->ckr_rhead, kring->ckr_rtail,
1582 	    kring->ckr_ring->ring_head,
1583 	    kring->ckr_ring->ring_tail);
1584 }
1585 #undef NM_FAIL_ON
1586 
1587 void
kr_txkring_reclaim_and_refill(struct __kern_channel_ring * kring,slot_idx_t index)1588 kr_txkring_reclaim_and_refill(struct __kern_channel_ring *kring,
1589     slot_idx_t index)
1590 {
1591 	const slot_idx_t lim = kring->ckr_lim;
1592 	slot_idx_t next_index = SLOT_NEXT(index, lim);
1593 
1594 	kring->ckr_khead = next_index;
1595 	/* reclaim */
1596 	kring->ckr_ktail = index;
1597 }
1598 
1599 /*
1600  * *************************************************************************
1601  * Checks on packet header offsets in kr_internalize_metadata
1602  * *************************************************************************
1603  *
1604  *  +----------+------------------------------+----------------------------+
1605  *  |          | NEXUS_META_SUBTYPE_RAW       | NEXUS_META_SUBTYPE_PAYLOAD |
1606  *  |----------+------------------------------+----------------------------+
1607  *  | buflet   | (bdoff + len) <= dlim        | (bdoff + len) <= dlim      |
1608  *  |----------+------------------------------+----------------------------+
1609  *  | headroom | hr == bdoff && hr < bdlim    | hr == 0 && bdoff == 0      |
1610  *  |----------+------------------------------+----------------------------+
1611  *  | l2_len   | hr + l2_len < bdim           | l2_len == 0                |
1612  *  |----------+------------------------------+----------------------------+
1613  */
1614 int
kr_internalize_metadata(struct kern_channel * ch,struct __kern_channel_ring * kring,const uint32_t maxfrags,struct __kern_quantum * kqum,struct proc * p)1615 kr_internalize_metadata(struct kern_channel *ch,
1616     struct __kern_channel_ring *kring, const uint32_t maxfrags,
1617     struct __kern_quantum *kqum, struct proc *p)
1618 {
1619 #pragma unused(kring, maxfrags, p)
1620 	struct __user_buflet *ubuf, *pubuf;     /* user buflet */
1621 	struct __kern_buflet *kbuf, *pkbuf;     /* kernel buflet */
1622 	struct __user_quantum *uqum;            /* user source */
1623 	struct __user_packet *upkt;
1624 	struct __kern_packet *kpkt;
1625 	const nexus_meta_type_t md_type = METADATA_TYPE(kqum);
1626 	const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum);
1627 	uint32_t len = 0;
1628 	uint16_t bcnt = 0, bmax, i, bdoff, bdlim;
1629 	boolean_t dropped;
1630 	int err = 0;
1631 
1632 	/*
1633 	 * Verify that the quantum/packet belongs to the same pp as
1634 	 * the one used by the adapter, i.e. the packet must have
1635 	 * been allocated from the same pp and attached to the kring.
1636 	 */
1637 	ASSERT(kqum->qum_pp == kring->ckr_pp);
1638 
1639 	_CASSERT(sizeof(uqum->qum_com) == sizeof(kqum->qum_com));
1640 	_CASSERT(sizeof(upkt->pkt_com) == sizeof(kpkt->pkt_com));
1641 	uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
1642 	ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
1643 	upkt = SK_PTR_ADDR_UPKT(uqum);
1644 	kpkt = SK_PTR_ADDR_KPKT(kqum);
1645 
1646 	DTRACE_SKYWALK3(internalize, struct __kern_channel_ring *, kring,
1647 	    struct __kern_packet *, kpkt, struct __user_packet *, upkt);
1648 	SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx uqum 0x%llx -> kqum 0x%llx",
1649 	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring),
1650 	    SK_KVA(uqum), SK_KVA(kqum));
1651 
1652 	/* check if it's dropped before we internalize it */
1653 	dropped = ((uqum->qum_qflags & QUM_F_DROPPED) != 0);
1654 
1655 	/*
1656 	 * Internalize common quantum metadata.
1657 	 *
1658 	 * For packet metadata, we trust the kernel copy for the buflet
1659 	 * count and limit; any mismatch on the user copy will cause
1660 	 * us to drop this packet.
1661 	 */
1662 	_QUM_INTERNALIZE(uqum, kqum);
1663 
1664 	/* if marked as dropped, don't bother going further */
1665 	if (__improbable(dropped)) {
1666 		SK_ERR("%s(%d) kring 0x%llx dropped",
1667 		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring));
1668 		err = ERANGE;
1669 		goto done;
1670 	}
1671 
1672 	switch (md_type) {
1673 	case NEXUS_META_TYPE_PACKET:
1674 		/*
1675 		 * Internalize common packet metadata.
1676 		 */
1677 		_PKT_INTERNALIZE(upkt, kpkt);
1678 
1679 		switch (md_subtype) {
1680 		case NEXUS_META_SUBTYPE_PAYLOAD:
1681 			/* sanitize link layer fields for payload mode */
1682 			kpkt->pkt_link_flags = 0;
1683 			break;
1684 		default:
1685 			break;
1686 		}
1687 
1688 		if (__probable(ch != NULL)) {
1689 			_UUID_COPY(kpkt->pkt_flowsrc_id,
1690 			    ch->ch_info->cinfo_ch_id);
1691 		}
1692 
1693 		bcnt = upkt->pkt_bufs_cnt;
1694 		bmax = kpkt->pkt_bufs_max;
1695 		ASSERT(bmax == maxfrags);
1696 		if (__improbable((bcnt == 0) || (bcnt > bmax) ||
1697 		    (upkt->pkt_bufs_max != bmax))) {
1698 			SK_ERR("%s(%d) kring 0x%llx bad bufcnt %d, %d, %d",
1699 			    sk_proc_name_address(p), sk_proc_pid(p),
1700 			    SK_KVA(kring), bcnt, bmax, upkt->pkt_bufs_max);
1701 			err = ERANGE;
1702 			goto done;
1703 		}
1704 		break;
1705 
1706 	case NEXUS_META_TYPE_QUANTUM:
1707 		ASSERT(maxfrags == 1);
1708 		bcnt = bmax = 1;
1709 		break;
1710 
1711 	default:
1712 		VERIFY(0);
1713 		/* NOTREACHED */
1714 		__builtin_unreachable();
1715 	}
1716 
1717 	ASSERT(bcnt != 0);
1718 	ubuf = pubuf = NULL;
1719 	kbuf = pkbuf = NULL;
1720 
1721 	/*
1722 	 * Validate and internalize buflets.
1723 	 */
1724 	for (i = 0; i < bcnt; i++) {
1725 		_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
1726 		_CASSERT(offsetof(struct __user_packet, pkt_qum) == 0);
1727 		_CASSERT(offsetof(struct __kern_quantum, qum_com) == 0);
1728 		PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
1729 		ASSERT(kbuf != NULL);
1730 		if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
1731 			ubuf = __DECONST(struct __user_buflet *,
1732 			    ((struct __kern_buflet_ext *)kbuf)->kbe_buf_user);
1733 		} else {
1734 			ASSERT(i == 0);
1735 			ubuf = __DECONST(struct __user_buflet *,
1736 			    &uqum->qum_buf[0]);
1737 		}
1738 		ASSERT(ubuf != NULL);
1739 		ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
1740 		ASSERT(kbuf->buf_dlim == kqum->qum_pp->pp_buflet_size);
1741 		ASSERT(kbuf->buf_addr != 0);
1742 		/*
1743 		 * For now, user-facing pool does not support shared
1744 		 * buffer, since otherwise the ubuf and kbuf buffer
1745 		 * indices would not match.  Assert this is the case.
1746 		 */
1747 		ASSERT(kbuf->buf_addr == (mach_vm_address_t)kbuf->buf_objaddr);
1748 
1749 		kbuf->buf_dlen = ubuf->buf_dlen;
1750 		kbuf->buf_doff = ubuf->buf_doff;
1751 
1752 		/*
1753 		 * kernel and user metadata use the same object index
1754 		 * also checks the sanity of buflet data offset and length
1755 		 */
1756 		if (__improbable(!BUF_IN_RANGE(kbuf) ||
1757 		    ubuf->buf_idx != kbuf->buf_idx)) {
1758 			kbuf->buf_dlen = kbuf->buf_doff = 0;
1759 			SK_ERR("%s(%d) kring 0x%llx bad bufidx 0x%x, 0x%x",
1760 			    sk_proc_name_address(p), sk_proc_pid(p),
1761 			    SK_KVA(kring), kbuf->buf_idx, ubuf->buf_idx);
1762 			err = ERANGE;
1763 			goto done;
1764 		}
1765 
1766 		/* save data offset from the first buflet */
1767 		if (pkbuf == NULL) {
1768 			bdoff = kbuf->buf_doff;
1769 		}
1770 
1771 		/* all good to go */
1772 		len += kbuf->buf_dlen;
1773 		pubuf = ubuf;
1774 		pkbuf = kbuf;
1775 	}
1776 
1777 	_CASSERT(offsetof(struct __kern_packet, pkt_length) ==
1778 	    offsetof(struct __kern_packet, pkt_qum.qum_len));
1779 	if (__improbable(kpkt->pkt_length != len)) {
1780 		SK_ERR("%s(%d) kring 0x%llx bad pktlen %d, %d",
1781 		    sk_proc_name_address(p), sk_proc_pid(p),
1782 		    SK_KVA(kring), kpkt->pkt_length, len);
1783 		err = ERANGE;
1784 		goto done;
1785 	}
1786 
1787 	if ((err == 0) && (md_type == NEXUS_META_TYPE_PACKET)) {
1788 		bdlim = kqum->qum_pp->pp_buflet_size;
1789 		switch (md_subtype) {
1790 		case NEXUS_META_SUBTYPE_RAW:
1791 			/*
1792 			 * For a raw packet from user space we need to
1793 			 * validate that headroom is sane and is in the
1794 			 * first buflet.
1795 			 */
1796 			if (__improbable(kpkt->pkt_headroom != bdoff)) {
1797 				SK_ERR("%s(%d) kring 0x%llx bad headroom %d, %d",
1798 				    sk_proc_name_address(p), sk_proc_pid(p),
1799 				    SK_KVA(kring), kpkt->pkt_headroom, bdoff);
1800 				err = ERANGE;
1801 				goto done;
1802 			}
1803 			if (__improbable(kpkt->pkt_headroom +
1804 			    kpkt->pkt_l2_len >= bdlim)) {
1805 				SK_ERR("%s(%d) kring 0x%llx bad headroom l2len %d, %d",
1806 				    sk_proc_name_address(p), sk_proc_pid(p),
1807 				    SK_KVA(kring), kpkt->pkt_l2_len, bdlim);
1808 				err = ERANGE;
1809 				goto done;
1810 			}
1811 			break;
1812 		case NEXUS_META_SUBTYPE_PAYLOAD:
1813 			/*
1814 			 * For a payload packet from user space we need
1815 			 * to validate that payload starts from 0 and L2
1816 			 * length is 0.
1817 			 */
1818 			if (__improbable((kpkt->pkt_headroom != 0) ||
1819 			    (kpkt->pkt_l2_len != 0))) {
1820 				SK_ERR("%s(%d) kring 0x%llx bad headroom "
1821 				    "payload subtype %d headroom %d l2len %d",
1822 				    sk_proc_name_address(p), sk_proc_pid(p),
1823 				    SK_KVA(kring), SK_PTR_SUBTYPE(kpkt),
1824 				    kpkt->pkt_headroom, kpkt->pkt_l2_len);
1825 				err = ERANGE;
1826 				goto done;
1827 			}
1828 			break;
1829 		default:
1830 			VERIFY(0);
1831 			/* NOTREACHED */
1832 			__builtin_unreachable();
1833 		}
1834 
1835 		/* validate checksum offload properties */
1836 		if (__probable(PACKET_HAS_PARTIAL_CHECKSUM(kpkt))) {
1837 			uint16_t start = kpkt->pkt_csum_tx_start_off;
1838 			uint16_t stuff = kpkt->pkt_csum_tx_stuff_off;
1839 			if (__improbable(start > stuff ||
1840 			    start > kpkt->pkt_length ||
1841 			    (stuff + sizeof(uint16_t)) > kpkt->pkt_length)) {
1842 				SK_ERR("%s(%d) flags 0x%x start %u stuff %u "
1843 				    "len %u", sk_proc_name_address(p),
1844 				    sk_proc_pid(p), kpkt->pkt_csum_flags,
1845 				    start, stuff, kpkt->pkt_length);
1846 				err = ERANGE;
1847 				goto done;
1848 			}
1849 		} else {
1850 			kpkt->pkt_csum_tx_start_off = 0;
1851 			kpkt->pkt_csum_tx_stuff_off = 0;
1852 		}
1853 		*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = bcnt;
1854 	}
1855 
1856 done:
1857 	if (__probable(err == 0)) {
1858 		kqum->qum_len = len;
1859 		kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_FINALIZED);
1860 	} else {
1861 		kqum->qum_len = 0;
1862 		kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_DROPPED);
1863 	}
1864 	return err;
1865 }
1866 
/*
 * kr_externalize_metadata_internal:
 *
 * Copy ("externalize") a kernel quantum/packet's metadata and buflet
 * state out to its user-space shadow object (kqum->qum_user), making it
 * visible to the channel consumer.  On return the user copy is marked
 * QUM_F_FINALIZED with qum_len set to the sum of all buflet data
 * lengths, and the kernel copy drops QUM_F_INTERNALIZED.
 *
 * kring:    ring the quantum is attached to; must share its packet pool
 *           (asserted below).
 * maxfrags: expected buflet limit; debug-asserted against pkt_bufs_max.
 * kqum:     kernel quantum to externalize; must be finalized or
 *           previously internalized, and must not be kernel-only.
 * p:        calling process; used for logging only.
 */
__attribute__((always_inline))
static inline void
kr_externalize_metadata_internal(struct __kern_channel_ring *kring,
    const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
{
/* parameters are only referenced by ASSERT/SK_DF instrumentation builds */
#pragma unused(kring, maxfrags, p)
	struct __kern_buflet *kbuf, *pkbuf;     /* kernel buflet */
	struct __user_buflet *ubuf, *pubuf;     /* user buflet */
	struct __user_quantum *uqum;            /* user destination */
	struct __user_packet *upkt;
	struct __kern_packet *kpkt;
	const nexus_meta_type_t md_type = METADATA_TYPE(kqum);
	const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum);
	uint32_t len = 0;                       /* running total of buflet data lengths */
	uint16_t bcnt = 0, bmax, i;

	/*
	 * Verify that the quantum/packet belongs to the same pp as
	 * the one used by the adapter, i.e. the packet must have
	 * been allocated from the same pp and attached to the kring.
	 */
	ASSERT(kqum->qum_pp == kring->ckr_pp);
	ASSERT(kqum->qum_qflags & (QUM_F_FINALIZED | QUM_F_INTERNALIZED));

	_CASSERT(sizeof(kpkt->pkt_com) == sizeof(upkt->pkt_com));
	_CASSERT(sizeof(kqum->qum_com) == sizeof(uqum->qum_com));
	uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
	/* kernel-only quanta have no user shadow and must never reach here */
	ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
	upkt = SK_PTR_ADDR_UPKT(uqum);
	kpkt = SK_PTR_ADDR_KPKT(kqum);

	DTRACE_SKYWALK3(externalize, struct __kern_channel_ring *, kring,
	    struct __kern_packet *, kpkt, struct __user_packet *, upkt);
	SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx kqum 0x%llx -> uqum 0x%llx",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring),
	    SK_KVA(kqum), SK_KVA(uqum));

	/*
	 * Externalize common quantum metadata.
	 */
	_QUM_EXTERNALIZE(kqum, uqum);

	switch (md_type) {
	case NEXUS_META_TYPE_PACKET: {
		bcnt = kpkt->pkt_bufs_cnt;
		bmax = kpkt->pkt_bufs_max;
		ASSERT(bmax == maxfrags);
		ASSERT(bcnt <= bmax);
		/*
		 * Externalize common packet metadata.
		 */
		_PKT_EXTERNALIZE(kpkt, upkt);

		/*
		 * Sanitize buflet count and limit: the user-visible fields
		 * are declared const, so cast through uintptr_t (deconst)
		 * to force them to the trusted kernel values.
		 */
		_CASSERT(sizeof(upkt->pkt_bufs_max) == sizeof(uint16_t));
		_CASSERT(sizeof(upkt->pkt_bufs_cnt) == sizeof(uint16_t));
		*(uint16_t *)(uintptr_t)&upkt->pkt_bufs_max = bmax;
		*(uint16_t *)(uintptr_t)&upkt->pkt_bufs_cnt = bcnt;

		switch (md_subtype) {
		case NEXUS_META_SUBTYPE_PAYLOAD:
			/* sanitize link layer fields for payload mode */
			upkt->pkt_headroom = 0;
			upkt->pkt_link_flags = 0;
			break;
		default:
			break;
		}
		break;
	}

	case NEXUS_META_TYPE_QUANTUM:
		/* a plain quantum always carries exactly one buflet */
		ASSERT(maxfrags == 1);
		bcnt = bmax = 1;
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(bcnt != 0);
	/*
	 * Special handling to externalize an empty packet buflet: if the
	 * embedded (first) kernel buflet has no buffer attached, point it
	 * at the corresponding user buflet so the loop below still has a
	 * valid user-side counterpart to fill in.
	 */
	kbuf = &kpkt->pkt_qum.qum_buf[0];
	if (kbuf->buf_addr == 0) {
		ubuf = __DECONST(struct __user_buflet *,
		    &kpkt->pkt_qum.qum_user->qum_buf[0]);
		UBUF_INIT(kbuf, ubuf);
	}

	kbuf = pkbuf = NULL;
	ubuf = pubuf = NULL;
	/*
	 * Externalize buflets: walk the kernel buflet chain and mirror
	 * each one into its paired user buflet, accumulating total length.
	 */
	for (i = 0; i < bcnt; i++) {
		_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
		PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
		ASSERT(kbuf != NULL);

		if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
			/* chained/external buflet carries its own user pointer */
			ubuf = __DECONST(struct __user_buflet *,
			    ((struct __kern_buflet_ext *)kbuf)->kbe_buf_user);
		} else {
			/* only the first buflet may be the embedded one */
			ASSERT(i == 0);
			ubuf = __DECONST(struct __user_buflet *,
			    &kpkt->pkt_qum.qum_user->qum_buf[0]);
		}

		ASSERT(ubuf != NULL);
		/* the chain must make forward progress each iteration */
		ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
		ASSERT(BUF_IN_RANGE(kbuf));
		KBUF_EXTERNALIZE(kbuf, ubuf, kqum->qum_pp);

		/* all good to go */
		len += kbuf->buf_dlen;
		pkbuf = kbuf;
		pubuf = ubuf;
	}

	/* publish total length and finalize the user-visible copy */
	uqum->qum_len = len;
	uqum->qum_qflags |= QUM_F_FINALIZED;

	/*
	 * XXX: [email protected] -- do this during reclaim instead?
	 */
	kqum->qum_qflags &= ~QUM_F_INTERNALIZED;
}
1998 
1999 
/*
 * kr_externalize_metadata:
 *
 * Out-of-line entry point for externalizing a kernel quantum/packet's
 * metadata to its user-space shadow object.  Simply forwards to the
 * always-inline worker above, giving external callers a callable symbol
 * while in-file hot paths can use the inlined form directly.
 */
void
kr_externalize_metadata(struct __kern_channel_ring *kring,
    const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
{
	kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
}
2006