1 /*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <kern/sched_prim.h>
31 #include <sys/sdt.h>
32
33 static void kr_update_user_stats(struct __kern_channel_ring *,
34 uint32_t, uint32_t);
35 static void kr_externalize_metadata_internal(struct __kern_channel_ring *,
36 const uint32_t, struct __kern_quantum *, struct proc *);
37
/* ilog2 of EWMA decay rate (4) used for ring transfer averages */
#define KR_TRANSFER_DECAY       2
/* 0 means "use per-ring ckr_transfer_decay"; overridable via sysctl below */
static uint32_t kr_transfer_decay = 0;

/* seconds between foldings of accumulated ring counters into averages */
#define KR_ACCUMULATE_INTERVAL  2       /* 2 seconds */
static uint32_t kr_accumulate_interval = KR_ACCUMULATE_INTERVAL;

#if (DEVELOPMENT || DEBUG)
#define KR_STAT_ENABLE  1
#else /* !(DEVELOPMENT || DEBUG) */
#define KR_STAT_ENABLE  0
#endif /* !(DEVELOPMENT || DEBUG) */
/* Enable/Disable ring stats collection */
uint32_t kr_stat_enable = KR_STAT_ENABLE;

#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_transfer_decay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &kr_transfer_decay,
    0, "ilog2 of EWMA decay rate of ring transfers");

SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_accumulate_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &kr_accumulate_interval,
    KR_ACCUMULATE_INTERVAL, "accumulation interval for ring stats");

/* when non-zero, sync errors are reported but do not panic the system */
uint32_t kr_disable_panic_on_sync_err = 0;
SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_panic_on_sync_err,
    CTLFLAG_RW | CTLFLAG_LOCKED, &kr_disable_panic_on_sync_err,
    0, "disable panic on sync error");
#endif /* (DEVELOPMENT || DEBUG) */

SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_enable,
    CTLFLAG_RW | CTLFLAG_LOCKED, &kr_stat_enable,
    0, "enable/disable stats collection for ring");
70
/*
 * Exponentially-weighted moving average, computed in integer math as
 *   avg = ((avg << decay) - avg + new) >> decay
 * i.e. avg += (new - avg) / 2^decay.  A zero "old" value seeds the
 * average directly with "new" so the first sample is not damped.
 */
#define KR_EWMA(old, new, decay) do {                                   \
	u_int64_t _avg;                                                 \
	if (__probable((_avg = (old)) > 0))                             \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)
79
80 void
kr_init_to_mhints(struct __kern_channel_ring * kring,uint32_t nslots)81 kr_init_to_mhints(struct __kern_channel_ring *kring, uint32_t nslots)
82 {
83 uint32_t tail;
84
85 tail = nslots - 1;
86
87 kring->ckr_transfer_decay = KR_TRANSFER_DECAY;
88 kring->ckr_num_slots = nslots;
89 *(slot_idx_t *)(uintptr_t)&kring->ckr_lim = (nslots - 1);
90 kring->ckr_rhead = kring->ckr_khead = 0;
91 /* IMPORTANT: Always keep one slot empty */
92 kring->ckr_rtail = kring->ckr_ktail =
93 ((kring->ckr_tx == NR_TX) || (kring->ckr_tx == NR_F) ? tail : 0);
94 }
95
/*
 * Try to obtain exclusive right to issue the *sync() or state change
 * operations on the ring. The right is obtained and must be later
 * relinquished via kr_exit() if and only if kr_enter() returns 0.
 *
 * In all cases the caller will typically skip the ring, possibly collecting
 * errors along the way.
 *
 * If the calling context does not allow sleeping, the caller must pass
 * FALSE in can_sleep; EBUSY may be returned if the right is held by
 * another thread. Otherwise, the caller may block until the right is
 * released by the previous holder.
 */
int
kr_enter(struct __kern_channel_ring *kr, boolean_t can_sleep)
{
	lck_spin_lock(&kr->ckr_slock);
	/* recursive acquisition by the owner: just bump the busy count */
	if (kr->ckr_owner == current_thread()) {
		ASSERT(kr->ckr_busy != 0);
		kr->ckr_busy++;
		goto done;
	}
	if (!can_sleep) {
		/* caller cannot block; fail if the right is held */
		if (kr->ckr_busy != 0) {
			lck_spin_unlock(&kr->ckr_slock);
			return EBUSY;
		}
	} else {
		/*
		 * Sleep until the holder releases the right; the spinlock
		 * is dropped across thread_block() and retaken before the
		 * busy count is re-examined.
		 */
		while (kr->ckr_busy != 0) {
			kr->ckr_want++;
			(void) assert_wait(&kr->ckr_busy, THREAD_UNINT);
			lck_spin_unlock(&kr->ckr_slock);
			(void) thread_block(THREAD_CONTINUE_NULL);
			SK_DF(SK_VERB_LOCKS, "waited for kr \"%s\" "
			    "(0x%llx) busy=%u", kr->ckr_name,
			    SK_KVA(kr), kr->ckr_busy);
			lck_spin_lock(&kr->ckr_slock);
		}
	}
	LCK_SPIN_ASSERT(&kr->ckr_slock, LCK_ASSERT_OWNED);
	ASSERT(kr->ckr_busy == 0);
	kr->ckr_busy++;
	kr->ckr_owner = current_thread();
done:
	lck_spin_unlock(&kr->ckr_slock);

	SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right acquired",
	    kr->ckr_name, SK_KVA(kr));

	return 0;
}
147
148 void
kr_exit(struct __kern_channel_ring * kr)149 kr_exit(struct __kern_channel_ring *kr)
150 {
151 uint32_t want = 0;
152
153 lck_spin_lock(&kr->ckr_slock);
154 ASSERT(kr->ckr_busy != 0);
155 ASSERT(kr->ckr_owner == current_thread());
156 if (--kr->ckr_busy == 0) {
157 kr->ckr_owner = NULL;
158
159 /*
160 * we're done with the kring;
161 * notify anyone that has lost the race
162 */
163 if ((want = kr->ckr_want) != 0) {
164 kr->ckr_want = 0;
165 wakeup((void *)&kr->ckr_busy);
166 lck_spin_unlock(&kr->ckr_slock);
167 } else {
168 lck_spin_unlock(&kr->ckr_slock);
169 }
170 } else {
171 lck_spin_unlock(&kr->ckr_slock);
172 }
173
174 SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right released (%u waiters)",
175 kr->ckr_name, SK_KVA(kr), want);
176 }
177
178
179 void
kr_start(struct __kern_channel_ring * kr)180 kr_start(struct __kern_channel_ring *kr)
181 {
182 lck_spin_lock(&kr->ckr_slock);
183 ASSERT(kr->ckr_busy != 0);
184 ASSERT(kr->ckr_state == KR_STOPPED || kr->ckr_state == KR_LOCKED);
185 /* now clear the state */
186 kr->ckr_state = KR_READY;
187 lck_spin_unlock(&kr->ckr_slock);
188
189 kr_exit(kr);
190
191 SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) is started",
192 kr->ckr_name, SK_KVA(kr));
193 }
194
195 /*
196 * Put the kring in the 'stopped' state: either KR_STOPPED or KR_LOCKED.
197 * Also marks the ring as busy, which would require either kr_start() at a
198 * later point.
199 */
200 void
kr_stop(struct __kern_channel_ring * kr,uint32_t state)201 kr_stop(struct __kern_channel_ring *kr, uint32_t state)
202 {
203 uint32_t s;
204
205 ASSERT(state == KR_STOPPED || state == KR_LOCKED);
206
207 s = kr_enter(kr, TRUE);
208 ASSERT(s == 0);
209
210 lck_spin_lock(&kr->ckr_slock);
211 ASSERT(kr->ckr_busy != 0);
212 /* now set the state */
213 kr->ckr_state = state;
214 lck_spin_unlock(&kr->ckr_slock);
215
216 SK_DF(SK_VERB_LOCKS,
217 "kr \"%s\" (0x%llx) krflags 0x%b is now stopped s=%u",
218 kr->ckr_name, SK_KVA(kr), kr->ckr_flags, CKRF_BITS, state);
219 }
220
/*
 * Update the user-visible per-ring statistics after a sync.
 *
 * The sync timestamp is recorded unconditionally; all other work is
 * skipped when ring stats collection is disabled.  Once per
 * accumulation interval the accumulated byte/slot/sync counters are
 * folded into per-sync averages and their EWMAs, then reset.
 */
static void
kr_update_user_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
    uint32_t byte_count)
{
	uint64_t now;
	uint32_t transfer_decay = (kr_transfer_decay != 0) ?
	    kr_transfer_decay : kring->ckr_transfer_decay;
	channel_ring_user_stats_t stats = &kring->ckr_usr_stats;

	now = net_uptime();
	/* always stamp the sync time, even with stats collection off */
	kring->ckr_sync_time = now;

	if (kr_stat_enable == 0) {
		return;
	}

	stats->crsu_number_of_syncs++;
	stats->crsu_total_bytes_transferred += byte_count;
	stats->crsu_total_slots_transferred += slot_count;

	if (slot_count > stats->crsu_max_slots_transferred) {
		stats->crsu_max_slots_transferred = slot_count;
	}

	if (stats->crsu_min_slots_transferred == 0 ||
	    slot_count < stats->crsu_min_slots_transferred) {
		stats->crsu_min_slots_transferred = slot_count;
	}

	if (__probable(kring->ckr_user_accumulate_start != 0)) {
		if ((now - kring->ckr_user_accumulate_start) >=
		    kr_accumulate_interval) {
			uint64_t bps;
			uint64_t sps;
			uint64_t sps_ma;

			/*
			 * bytes per sync; ckr_user_accumulated_syncs is
			 * >= 1 here since it is incremented below on the
			 * same call that starts an accumulation period.
			 */
			bps = kring->ckr_user_accumulated_bytes /
			    kring->ckr_user_accumulated_syncs;
			KR_EWMA(stats->crsu_bytes_per_sync_ma,
			    bps, transfer_decay);
			stats->crsu_bytes_per_sync = bps;

			/* slots per sync */
			sps = kring->ckr_user_accumulated_slots /
			    kring->ckr_user_accumulated_syncs;
			sps_ma = stats->crsu_slots_per_sync_ma;
			KR_EWMA(sps_ma, sps, transfer_decay);
			stats->crsu_slots_per_sync_ma = (uint32_t)sps_ma;
			stats->crsu_slots_per_sync = (uint32_t)sps;

			/* start over */
			kring->ckr_user_accumulate_start = now;
			kring->ckr_user_accumulated_bytes = 0;
			kring->ckr_user_accumulated_slots = 0;
			kring->ckr_user_accumulated_syncs = 0;

			/* reset extrema for the new interval */
			stats->crsu_min_slots_transferred = 0;
			stats->crsu_max_slots_transferred = 0;
		}
	} else {
		/* first enabled sync: open an accumulation period */
		kring->ckr_user_accumulate_start = now;
	}

	kring->ckr_user_accumulated_bytes += byte_count;
	kring->ckr_user_accumulated_slots += slot_count;
	kring->ckr_user_accumulated_syncs++;
}
289
290 /* caller to make sure thread safety */
291 void
kr_update_stats(struct __kern_channel_ring * kring,uint32_t slot_count,uint32_t byte_count)292 kr_update_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
293 uint32_t byte_count)
294 {
295 uint64_t now;
296 uint64_t diff_secs;
297 channel_ring_stats_t stats = &kring->ckr_stats;
298 uint32_t transfer_decay = (kr_transfer_decay != 0) ?
299 kr_transfer_decay : kring->ckr_transfer_decay;
300
301 if (kr_stat_enable == 0) {
302 return;
303 }
304
305 if (__improbable(slot_count == 0)) {
306 return;
307 }
308
309 stats->crs_number_of_transfers++;
310 stats->crs_total_bytes_transferred += byte_count;
311 stats->crs_total_slots_transferred += slot_count;
312 if (slot_count > stats->crs_max_slots_transferred) {
313 stats->crs_max_slots_transferred = slot_count;
314 }
315 if (stats->crs_min_slots_transferred == 0 ||
316 slot_count < stats->crs_min_slots_transferred) {
317 stats->crs_min_slots_transferred = slot_count;
318 }
319
320 now = net_uptime();
321 if (__probable(kring->ckr_accumulate_start != 0)) {
322 diff_secs = now - kring->ckr_accumulate_start;
323 if (diff_secs >= kr_accumulate_interval) {
324 uint64_t bps;
325 uint64_t sps;
326 uint64_t sps_ma;
327
328 /* bytes per second */
329 bps = kring->ckr_accumulated_bytes / diff_secs;
330 KR_EWMA(stats->crs_bytes_per_second_ma,
331 bps, transfer_decay);
332 stats->crs_bytes_per_second = bps;
333
334 /* slots per second */
335 sps = kring->ckr_accumulated_slots / diff_secs;
336 sps_ma = stats->crs_slots_per_second_ma;
337 KR_EWMA(sps_ma, sps, transfer_decay);
338 stats->crs_slots_per_second_ma = (uint32_t)sps_ma;
339 stats->crs_slots_per_second = (uint32_t)sps;
340
341 /* start over */
342 kring->ckr_accumulate_start = now;
343 kring->ckr_accumulated_bytes = 0;
344 kring->ckr_accumulated_slots = 0;
345
346 stats->crs_min_slots_transferred = 0;
347 stats->crs_max_slots_transferred = 0;
348 }
349 } else {
350 kring->ckr_accumulate_start = now;
351 }
352 kring->ckr_accumulated_bytes += byte_count;
353 kring->ckr_accumulated_slots += slot_count;
354 }
355
356 /* True if no space in the tx ring. only valid after kr_txsync_prologue */
357 boolean_t
kr_txempty(struct __kern_channel_ring * kring)358 kr_txempty(struct __kern_channel_ring *kring)
359 {
360 return kring->ckr_rhead == kring->ckr_ktail;
361 }
362
363 #if SK_LOG
364 /*
365 * Error logging routine called when txsync/rxsync detects an error.
366 * Expected to be called before killing the process with skywalk_kill_process()
367 *
368 * This routine is only called by the upper half of the kernel.
369 * It only reads khead (which is changed only by the upper half, too)
370 * and ktail (which may be changed by the lower half, but only on
371 * a tx ring and only to increase it, so any error will be recovered
372 * on the next call). For the above, we don't strictly need to call
373 * it under lock.
374 */
375 void
kr_log_bad_ring(struct __kern_channel_ring * kring)376 kr_log_bad_ring(struct __kern_channel_ring *kring)
377 {
378 struct __user_channel_ring *ring = kring->ckr_ring;
379 const slot_idx_t lim = kring->ckr_lim;
380 slot_idx_t i;
381 int errors = 0;
382
383 // XXX KASSERT nm_kr_tryget
384 SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b", kring->ckr_name,
385 SK_KVA(kring), kring->ckr_flags, CKRF_BITS);
386 // XXX probably wrong to trust userspace
387
388 if (ring->ring_head > lim) {
389 errors++;
390 }
391 if (ring->ring_tail > lim) {
392 errors++;
393 }
394 for (i = 0; i <= lim; i++) {
395 struct __kern_slot_desc *ksd = KR_KSD(kring, i);
396 struct __kern_quantum *kqum = ksd->sd_qum;
397 obj_idx_t idx;
398 uint32_t len;
399
400 if (!KSD_VALID_METADATA(ksd)) {
401 continue;
402 }
403
404 idx = METADATA_IDX(kqum);
405 len = kqum->qum_len;
406 if (len > kring->ckr_max_pkt_len) {
407 SK_RDERR(5, "bad len at slot %u idx %u len %u",
408 i, idx, len);
409 }
410 }
411
412 if (errors != 0) {
413 SK_ERR("total %d errors", errors);
414 SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b crash, "
415 "head %u -> %u tail %u -> %u", kring->ckr_name,
416 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, ring->ring_head,
417 kring->ckr_rhead, kring->ckr_khead,
418 ring->ring_tail, kring->ckr_ktail);
419 }
420 }
421 #endif /* SK_LOG */
422
/*
 * Reclaim slots released by user space; returns the number of slots
 * reclaimed (always 0 for TX rings, see below).
 */
uint32_t
kr_reclaim(struct __kern_channel_ring *kr)
{
	int r = 0;

	VERIFY(sk_is_sync_protected());

	/*
	 * This is a no-op for TX ring, since the TX reclaim logic is only
	 * known to the nexus itself. There, the nexus's TX sync code would
	 * figure out the number of slots that has been "transmitted", and
	 * advance the slot pointer accordingly. This routine would then be
	 * called as a way to advise the system of such condition.
	 *
	 * For RX ring, this will reclaim user-released slots, and it is
	 * to be called by the provider's RX sync routine prior to its
	 * processing new slots (into the RX ring).
	 *
	 * It is therefore advised that this routine be called at the start
	 * of the RX sync callback, as well as at the end of the TX sync
	 * callback; the latter is useful in case we decide to implement
	 * more logic in future.
	 */
	if ((kr->ckr_tx == NR_RX) || (kr->ckr_tx == NR_EV)) {
		/*
		 * # of reclaimed slots; the unsigned subtraction wraps
		 * and converts to a negative int when rhead < khead,
		 * which the correction below accounts for.
		 */
		r = kr->ckr_rhead - kr->ckr_khead;
		if (r < 0) {
			r += kr->ckr_num_slots;
		}

		kr->ckr_khead = kr->ckr_rhead;
		/* ensure global visibility */
		membar_sync();
	}

	return (slot_idx_t)r;
}
460
/*
 * Nexus-specific kr_txsync_prologue() callback.
 *
 * Walks the slots just produced by user space (ckr_rhead -> head),
 * internalizing each packet's user metadata into the kernel view and
 * accumulating the total byte count into *byte_count.  Returns 0 on
 * success; on failure returns -1 with *err_reason set to a
 * SKYWALK_KILL_REASON_* code (the caller kills the process).
 */
int
kr_txprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
    const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
    struct proc *p)
{
	struct kern_pbufpool *pp = kring->ckr_pp;
	const uint32_t maxfrags = pp->pp_max_frags;
	slot_idx_t slot_idx = kring->ckr_rhead;

	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));

	while (slot_idx != head) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		struct __kern_quantum *kqum = ksd->sd_qum;
		int err;

		/*
		 * For packets visible to user space, the kernel and user
		 * metadata indices must agree.
		 */
		if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
		    METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
			SK_ERR("qum index mismatch");
			*err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
			return -1;
		}

		/* Internalize */
		err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
		if (__improbable(err != 0)) {
			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped "
			    "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
			    sk_proc_name_address(p), sk_proc_pid(p),
			    kring->ckr_name, SK_KVA(kring), slot_idx, err,
			    kring->ckr_khead, kring->ckr_ktail,
			    kring->ckr_rhead, kring->ckr_rtail,
			    kring->ckr_ring->ring_head,
			    kring->ckr_ring->ring_tail);
			*err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
			return -1;
		}

		*byte_count += kqum->qum_len;
		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

	return 0;
}
508
/*
 * Nexus-specific kr_txsync_prologue() callback - user packet pool variant.
 *
 * Same contract as kr_txprologue(), but each slot's packet must first
 * be removed from the pool's allocated-packet list (the channel owns
 * its own packet pool).  On validation/internalize failure the packet
 * just removed is freed back to the pool.  Returns 0 on success, or a
 * non-zero errno with *err_reason set to a SKYWALK_KILL_REASON_* code.
 */
int
kr_txprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
    const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
    struct proc *p)
{
	struct kern_pbufpool *pp = kring->ckr_pp;
	const uint32_t maxfrags = pp->pp_max_frags;
	slot_idx_t slot_idx = kring->ckr_rhead;
	struct __kern_quantum *kqum = NULL;
	bool free_pkt = false;          /* free kqum on the error path? */
	int err = 0;

	ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);

	/* the pool lock is held across the whole walk */
	PP_LOCK(pp);
	while (slot_idx != head) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		struct __user_slot_desc *usd = KR_USD(kring, slot_idx);

		/*
		 * The channel is operating in user packet pool mode;
		 * check if the packet is in the allocated list.
		 */
		kqum = pp_remove_upp_locked(pp, usd->sd_md_idx, &err);
		if (__improbable(err != 0)) {
			/* non-NULL kqum with err set: bad buflet chain */
			if (kqum != NULL) {
				SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u "
				    "kqum %p, bad buflet chain",
				    sk_proc_name_address(p), sk_proc_pid(p),
				    kring->ckr_name, SK_KVA(kring), slot_idx,
				    SK_KVA(kqum));
				*err_reason =
				    SKYWALK_KILL_REASON_BAD_BUFLET_CHAIN;
				goto done;
			}

			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u "
			    " unallocated packet %u kh %u kt %u | "
			    "rh %u rt %u | h %u t %u",
			    sk_proc_name_address(p), sk_proc_pid(p),
			    kring->ckr_name, SK_KVA(kring), slot_idx,
			    usd->sd_md_idx, kring->ckr_khead, kring->ckr_ktail,
			    kring->ckr_rhead, kring->ckr_rtail,
			    kring->ckr_ring->ring_head,
			    kring->ckr_ring->ring_tail);
			*err_reason = SKYWALK_KILL_REASON_UNALLOCATED_PKT;
			goto done;
		}

		/* kernel/user metadata indices must agree (see txprologue) */
		if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
		    METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
			SK_ERR("qum index mismatch");
			*err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
			err = ERANGE;
			free_pkt = true;
			goto done;
		}

		/* Internalize */
		err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
		if (__improbable(err != 0)) {
			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped "
			    "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
			    sk_proc_name_address(p), sk_proc_pid(p),
			    kring->ckr_name, SK_KVA(kring), slot_idx, err,
			    kring->ckr_khead, kring->ckr_ktail,
			    kring->ckr_rhead, kring->ckr_rtail,
			    kring->ckr_ring->ring_head,
			    kring->ckr_ring->ring_tail);
			*err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
			free_pkt = true;
			goto done;
		}

		/*
		 * Attach packet to slot, detach mapping from alloc ring slot.
		 */
		kqum->qum_ksd = NULL;
		USD_RESET(usd);
		KR_SLOT_ATTACH_METADATA(kring, ksd, kqum);

		*byte_count += kqum->qum_len;
		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

done:
	PP_UNLOCK(pp);
	/* free the already-removed packet outside the pool lock */
	if (__improbable(err != 0) && free_pkt) {
		ASSERT(kqum != NULL);
		kqum->qum_ksd = NULL;
		pp_free_packet(pp, (uint64_t)kqum);
	}
	return err;
}
606
/*
 * Record the kill reason and jump to the enclosing function's "error"
 * label when the condition holds.
 */
#define NM_FAIL_ON(t, reason) if (__improbable(t)) { SK_ERR("fail " #t); \
	err_reason = reason; goto error; }
/*
 * Validate parameters in the TX/FREE ring/kring.
 *
 * ckr_rhead, ckr_rtail=ktail are stored from previous round.
 * khead is the next packet to send to the ring.
 *
 * We want
 *    khead <= *ckr_rhead <= head <= tail = *ckr_rtail <= ktail
 *
 * ckr_khead, ckr_rhead, ckr_rtail and ckr_ktail are reliable
 */
#define _KR_TXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh, _krt) do {\
	slot_idx_t _n = (_kring)->ckr_num_slots;                        \
	/* kernel sanity checks */                                      \
	NM_FAIL_ON((_kh) >= _n || kring->ckr_rhead >= _n || (_krt) >= _n || \
	    (_kt) >= _n, SKYWALK_KILL_REASON_BASIC_SANITY);             \
	/* user basic sanity checks */                                  \
	NM_FAIL_ON((_rh) >= _n, SKYWALK_KILL_REASON_BASIC_SANITY);      \
	/*                                                              \
	 * user sanity checks. We only use 'cur',                       \
	 * A, B, ... are possible positions for cur:                    \
	 *                                                              \
	 *  0    A  cur   B  tail  C  n-1                               \
	 *  0    D  tail  E  cur   F  n-1                               \
	 *                                                              \
	 * B, F, D are valid. A, C, E are wrong                         \
	 */                                                             \
	if ((_krt) >= kring->ckr_rhead) {                               \
	        /* want ckr_rhead <= head <= ckr_rtail */               \
	        NM_FAIL_ON((_rh) < kring->ckr_rhead || (_rh) > (_krt),  \
	            SKYWALK_KILL_REASON_HEAD_OOB);                      \
	} else { /* here ckr_rtail < ckr_rhead */                       \
	        /* we need head outside ckr_rtail .. ckr_rhead */       \
	        NM_FAIL_ON((_rh) > (_krt) && (_rh) < kring->ckr_rhead,  \
	            SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED);              \
	}                                                               \
	NM_FAIL_ON(ring->ring_tail != (_krt),                           \
	    SKYWALK_KILL_REASON_TAIL_MISMATCH);                         \
} while (0)
648
/*
 * Validate parameters in the ring/kring on entry for *_txsync().
 * Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
 * in case of error, in order to force a reinit.
 */
slot_idx_t
kr_txsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
    struct proc *p)
{
	struct __user_channel_ring *ring = kring->ckr_ring;
	slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
	slot_idx_t head;
	uint32_t byte_count = 0;
	uint64_t err_reason = 0;
	int slot_count;

	VERIFY(sk_is_sync_protected());
	/* assert that this routine is only called for user facing rings */
	ASSERT(!KR_KERNEL_ONLY(kring));
	ASSERT(kring->ckr_usds != NULL);

	/* read these once and use local copies */
	head = ring->ring_head;
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;
	/* order the reads above against the ckr_rtail read below */
	membar_sync();
	ckr_rtail = kring->ckr_rtail;

	SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, ckr_rtail,
	    ring->ring_head, ring->ring_tail);

	/* on validation failure this jumps to the "error" label below */
	_KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);

	/* # of new tx slots (wraps around the ring if negative) */
	slot_count = head - kring->ckr_rhead;
	if (slot_count < 0) {
		slot_count += kring->ckr_num_slots;
	}

	/*
	 * Invoke nexus-specific TX prologue callback, set in na_kr_create().
	 */
	if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
	    kring, head, &byte_count, &err_reason, p) != 0)) {
		goto error;
	}

	/* update the user's view of slots & bytes transferred */
	kr_update_user_stats(kring, slot_count, byte_count);

	/* update the kernel view of ring */
	kring->ckr_rhead = head;

	/* save for kr_txsync_finalize(); only khead is needed */
	kring->ckr_khead_pre = ckr_khead;

	return head;

error:
	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
	    "rh %u rt %u | h %u t %u |", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
	    CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead,
	    ckr_rtail, head, ring->ring_tail);

	/* the process fed us a corrupt ring; kill it */
	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_TX_SYNC);

	return kring->ckr_num_slots;
}
721
/*
 * Validate parameters in the ring/kring on entry for *_free_sync().
 * Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
 * in case of error, in order to force a reinit.
 */
slot_idx_t
kr_free_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
{
	struct __user_channel_ring *ring = kring->ckr_ring;
	slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
	slot_idx_t head;
	uint64_t err_reason = 0;

	VERIFY(sk_is_sync_protected());
	/* read these once and use local copies */
	head = ring->ring_head;
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;
	/* order the reads above against the ckr_rtail read below */
	membar_sync();
	ckr_rtail = kring->ckr_rtail;

	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, ckr_rtail, ring->ring_head, ring->ring_tail);

	/* on validation failure this jumps to the "error" label below */
	_KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);

	/* update the kernel view of ring */
	kring->ckr_rhead = head;
	return head;

error:
	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
	    "rh %u rt %u | h %u t %u |", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
	    CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead,
	    ckr_rtail, head, ring->ring_tail);

	/* the process fed us a corrupt ring; kill it */
	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_FREE_SYNC);
	return kring->ckr_num_slots;
}
764
/*
 * Nexus-specific kr_rxsync_prologue() callback.
 *
 * Walks the slots just consumed by user space (ckr_rhead -> head),
 * detaching and batch-freeing each packet back to the pool and
 * accumulating the freed byte count into *byte_count, which is then
 * subtracted from the ring's readable-byte counter.  Returns 0 on
 * success, -1 (with *err_reason set) on inconsistency.
 */
int
kr_rxprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
    const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
    struct proc *p)
{
#pragma unused(ch, p)
	slot_idx_t slot_idx = kring->ckr_rhead;
	uint32_t nfree = 0;

	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));

	/*
	 * Iterating through the slots just read by user-space;
	 * ckr_rhead -> ring_head
	 */
	while (slot_idx != head) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		struct __kern_quantum *kqum = ksd->sd_qum;

		ASSERT(KSD_VALID_METADATA(ksd));
		/* # of new bytes transferred */
		*byte_count += kqum->qum_len;

		/* detach and stage the packet for batched free below */
		(void) KR_SLOT_DETACH_METADATA(kring, ksd);
		ASSERT(nfree < kring->ckr_num_slots);
		kring->ckr_scratch[nfree++] = (uint64_t)kqum;

		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

	if (nfree > 0) {
		pp_free_packet_batch(kring->ckr_pp,
		    &kring->ckr_scratch[0], nfree);
	}

	/*
	 * Update userspace channel statistics of # readable bytes
	 * subtract byte counts from slots just given back to the kernel.
	 */
	if (kring->ckr_ready_bytes < *byte_count) {
		SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
		    "(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
		    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
		    SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
		    kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
		    kring->ckr_rtail, kring->ckr_ring->ring_head,
		    kring->ckr_ring->ring_tail);
		*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
		return -1;
	}
	kring->ckr_ready_bytes -= *byte_count;

	return 0;
}
823
/*
 * Nexus-specific kr_rxsync_prologue() callback - no detach variant.
 *
 * Same accounting as kr_rxprologue(), but the packets remain attached
 * to their slots; only the readable-byte counter is adjusted.
 * Returns 0 on success, -1 (with *err_reason set) on inconsistency.
 */
int
kr_rxprologue_nodetach(struct kern_channel *ch,
    struct __kern_channel_ring *kring, const slot_idx_t head,
    uint32_t *byte_count, uint64_t *err_reason, struct proc *p)
{
#pragma unused(ch, p)
	slot_idx_t slot_idx = kring->ckr_rhead;

	ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));

	/*
	 * Iterating through the slots just read by user-space;
	 * ckr_rhead -> ring_head
	 */
	while (slot_idx != head) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		struct __kern_quantum *kqum = ksd->sd_qum;

		ASSERT(KSD_VALID_METADATA(ksd));
		/* # of new bytes transferred */
		*byte_count += kqum->qum_len;
		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

	/*
	 * Update userspace channel statistics of # readable bytes
	 * subtract byte counts from slots just given back to the kernel.
	 */
	if (kring->ckr_ready_bytes < *byte_count) {
		SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
		    "(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
		    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
		    SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
		    kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
		    kring->ckr_rtail, kring->ckr_ring->ring_head,
		    kring->ckr_ring->ring_tail);
		*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
		return -1;
	}
	kring->ckr_ready_bytes -= *byte_count;

	return 0;
}
870
/*
 * Nexus-specific kr_rxsync_prologue() callback - user packet pool variant.
 *
 * In user packet pool mode the user must have detached every packet it
 * consumed; each returned slot is verified to be empty and its recorded
 * length is accumulated into *byte_count, which is then subtracted from
 * the ring's readable-byte counter.  Returns 0 on success, -1 (with
 * *err_reason set) on a non-detached slot or byte-count inconsistency.
 */
int
kr_rxprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
    const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
    struct proc *p)
{
#pragma unused(ch, p)
	slot_idx_t slot_idx = kring->ckr_rhead;

	ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);

	/*
	 * Iterating through the slots just read by user-space;
	 * ckr_rhead -> ring_head
	 */
	while (slot_idx != head) {
		struct __user_slot_desc *usd = KR_USD(kring, slot_idx);

		/*
		 * This is a user facing ring opting in for the user packet
		 * pool mode, so ensure that the user has detached packet
		 * from slot.
		 */
		ASSERT(!KSD_VALID_METADATA(KR_KSD(kring, slot_idx)));
		if (SD_VALID_METADATA(usd)) {
			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not "
			    "detached md %u kh %u kt %u | rh %u rt %u |"
			    " h %u t %u", sk_proc_name_address(p),
			    sk_proc_pid(p), kring->ckr_name,
			    SK_KVA(kring), slot_idx, usd->sd_md_idx,
			    kring->ckr_khead, kring->ckr_ktail,
			    kring->ckr_rhead, kring->ckr_rtail,
			    kring->ckr_ring->ring_head,
			    kring->ckr_ring->ring_tail);
			*err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
			return -1;
		}
		*byte_count += usd->sd_len;

		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

	/*
	 * update userspace channel statistics of # readable bytes
	 * subtract byte counts from slots just given back to the kernel
	 */
	if (kring->ckr_ready_bytes < *byte_count) {
		SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
		    "(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
		    sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
		    SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
		    kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
		    kring->ckr_rtail, kring->ckr_ring->ring_head,
		    kring->ckr_ring->ring_tail);
		*err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
		return -1;
	}
	kring->ckr_ready_bytes -= *byte_count;

	return 0;
}
934
/*
 * Validate parameters in the RX/ALLOC/EVENT ring/kring.
 * For a valid configuration,
 *     khead <= head <= tail <= ktail
 *
 * We only consider head.
 * khead and ktail are reliable.
 *
 * Like _KR_TXRING_VALIDATE, a failed check jumps to the enclosing
 * function's "error" label via NM_FAIL_ON.
 */
#define _KR_RXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh) do {          \
	slot_idx_t _n = (_kring)->ckr_num_slots;                        \
	/* kernel sanity checks */                                      \
	NM_FAIL_ON((_kh) >= _n || (_kt) >= _n,                          \
	    SKYWALK_KILL_REASON_BASIC_SANITY);                          \
	/* user sanity checks */                                        \
	if ((_kt) >= (_kh)) {                                           \
	        /* want khead <= head <= ktail */                       \
	        NM_FAIL_ON((_rh) < (_kh) || (_rh) > (_kt),              \
	            SKYWALK_KILL_REASON_HEAD_OOB);                      \
	} else {                                                        \
	        /* we need head outside ktail..khead */                 \
	        NM_FAIL_ON((_rh) < (_kh) && (_rh) > (_kt),              \
	            SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED);              \
	}                                                               \
	NM_FAIL_ON((_ring)->ring_tail != (_kring)->ckr_rtail,           \
	    SKYWALK_KILL_REASON_TAIL_MISMATCH);                         \
} while (0)
961
962 /*
963 * Validate parameters in the ring/kring on entry for *_rxsync().
964 * Returns ring->ring_head if ok, kring->ckr_num_slots on error,
965 * in order to force a reinit.
966 */
967 slot_idx_t
kr_rxsync_prologue(struct kern_channel * ch,struct __kern_channel_ring * kring,struct proc * p)968 kr_rxsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
969 struct proc *p)
970 {
971 #pragma unused(ch)
972 struct __user_channel_ring *ring = kring->ckr_ring;
973 slot_idx_t ckr_khead, ckr_ktail;
974 slot_idx_t head;
975 uint32_t byte_count = 0;
976 uint64_t err_reason = 0;
977 int slot_count;
978
979 VERIFY(sk_is_sync_protected());
980 /* assert that this routine is only called for user facing rings */
981 ASSERT(!KR_KERNEL_ONLY(kring));
982 ASSERT(kring->ckr_usds != NULL);
983
984 /* read these once and use local copies */
985 ckr_khead = kring->ckr_khead;
986 ckr_ktail = kring->ckr_ktail;
987
988 SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
989 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
990 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
991 kring->ckr_rhead, kring->ckr_rtail,
992 ring->ring_head, ring->ring_tail);
993 /*
994 * Before storing the new values, we should check they do not
995 * move backwards. However:
996 * - head is not an issue because the previous value is khead;
997 * - cur could in principle go back, however it does not matter
998 * because we are processing a brand new rxsync()
999 */
1000 head = ring->ring_head; /* read only once */
1001
1002 _KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
1003
1004 /* # of reclaimed slots */
1005 slot_count = head - kring->ckr_rhead;
1006 if (slot_count < 0) {
1007 slot_count += kring->ckr_num_slots;
1008 }
1009
1010 /*
1011 * Invoke nexus-specific RX prologue callback, which may detach
1012 * and free any consumed packets. Configured in na_kr_create().
1013 */
1014 if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
1015 kring, head, &byte_count, &err_reason, p) != 0)) {
1016 goto error;
1017 }
1018 /* update the user's view of slots & bytes transferred */
1019 kr_update_user_stats(kring, slot_count, byte_count);
1020
1021 /* update the kernel view of ring */
1022 kring->ckr_rhead = head;
1023 return head;
1024
1025 error:
1026 SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
1027 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1028 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1029 CKRF_BITS, ckr_khead, ckr_ktail,
1030 kring->ckr_rhead, kring->ckr_rtail,
1031 ring->ring_head, ring->ring_tail);
1032
1033 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_RX_SYNC);
1034 return kring->ckr_num_slots;
1035 }
1036
1037 /*
1038 * Validate parameters on the ring/kring on entry for *_alloc_sync().
1039 * Returns ring->ring_head if ok, kring->ckr_num_slots on error,
1040 * in order to force a reinit.
1041 */
1042 slot_idx_t
kr_alloc_sync_prologue(struct __kern_channel_ring * kring,struct proc * p)1043 kr_alloc_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
1044 {
1045 struct __user_channel_ring *ring = kring->ckr_ring;
1046 slot_idx_t ckr_khead, ckr_ktail;
1047 slot_idx_t head;
1048 uint64_t err_reason = 0;
1049
1050 VERIFY(sk_is_sync_protected());
1051
1052 /* read these once and use local copies */
1053 ckr_khead = kring->ckr_khead;
1054 ckr_ktail = kring->ckr_ktail;
1055 head = ring->ring_head;
1056
1057 SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1058 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1059 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1060 kring->ckr_rhead, kring->ckr_rtail,
1061 head, ring->ring_tail);
1062 /*
1063 * Before storing the new values, we should check they do not
1064 * move backwards. However, head is not an issue because the
1065 * previous value is khead;
1066 */
1067 _KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
1068
1069 /* update the kernel view of ring */
1070 kring->ckr_rhead = head;
1071 return head;
1072
1073 error:
1074 SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
1075 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1076 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1077 CKRF_BITS, ckr_khead, ckr_ktail,
1078 kring->ckr_rhead, kring->ckr_rtail,
1079 ring->ring_head, ring->ring_tail);
1080
1081 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_ALLOC_SYNC);
1082 return kring->ckr_num_slots;
1083 }
1084
1085 /*
1086 * Nexus-specific kr_txsync_finalize() callback.
1087 */
1088 void
kr_txfinalize(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t head,struct proc * p)1089 kr_txfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1090 const slot_idx_t head, struct proc *p)
1091 {
1092 #pragma unused(ch)
1093 struct kern_pbufpool *pp = kring->ckr_pp;
1094 slot_idx_t slot_idx;
1095 uint32_t ph_cnt, i = 0;
1096 int32_t ph_needed;
1097 int err;
1098
1099 ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
1100
1101 /* use khead value from pre-sync time */
1102 slot_idx = kring->ckr_khead_pre;
1103
1104 ph_needed = head - slot_idx;
1105 if (ph_needed < 0) {
1106 ph_needed += kring->ckr_num_slots;
1107 }
1108 if (ph_needed == 0) {
1109 return;
1110 }
1111
1112 ph_cnt = (uint32_t)ph_needed;
1113 err = kern_pbufpool_alloc_batch(pp, 1, kring->ckr_scratch, &ph_cnt);
1114 VERIFY(err == 0 && ph_cnt == (uint32_t)ph_needed);
1115
1116 /* recycle the transferred packets */
1117 while (slot_idx != head) {
1118 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1119 kern_packet_t ph;
1120
1121 if (KSD_VALID_METADATA(ksd)) {
1122 goto next_slot;
1123 }
1124
1125 ph = kring->ckr_scratch[i];
1126 ASSERT(ph != 0);
1127 kring->ckr_scratch[i] = 0;
1128 ++i;
1129
1130 /*
1131 * Since this packet is freshly allocated and we need
1132 * to have the flag set for the attach to succeed,
1133 * just set it here rather than calling
1134 * __packet_finalize().
1135 */
1136 SK_PTR_ADDR_KQUM(ph)->qum_qflags |= QUM_F_FINALIZED;
1137
1138 KR_SLOT_ATTACH_METADATA(kring, ksd, SK_PTR_ADDR_KQUM(ph));
1139
1140 kr_externalize_metadata_internal(kring, pp->pp_max_frags,
1141 SK_PTR_ADDR_KQUM(ph), p);
1142 next_slot:
1143 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1144 }
1145
1146 if (i != ph_cnt) {
1147 kern_pbufpool_free_batch(pp, &kring->ckr_scratch[i],
1148 ph_cnt - i);
1149 }
1150 }
1151
1152 /*
1153 * Nexus-specific kr_txsync_finalize() callback - user packet pool variant.
1154 */
1155 void
kr_txfinalize_upp(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t head,struct proc * p)1156 kr_txfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
1157 const slot_idx_t head, struct proc *p)
1158 {
1159 #pragma unused(ch, p)
1160 slot_idx_t slot_idx;
1161 uint32_t nfree = 0;
1162
1163 ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
1164
1165 /* use khead value from pre-sync time */
1166 slot_idx = kring->ckr_khead_pre;
1167
1168 /* recycle the transferred packets */
1169 while (slot_idx != head) {
1170 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1171
1172 if (KSD_VALID_METADATA(ksd)) {
1173 /* detach and free the packet */
1174 struct __kern_quantum *kqum = ksd->sd_qum;
1175 (void) KR_SLOT_DETACH_METADATA(kring, ksd);
1176 ASSERT(nfree < kring->ckr_num_slots);
1177 kring->ckr_scratch[nfree++] = (uint64_t)kqum;
1178 }
1179
1180 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1181 }
1182
1183 if (__probable(nfree > 0)) {
1184 pp_free_packet_batch(kring->ckr_pp,
1185 &kring->ckr_scratch[0], nfree);
1186 }
1187 }
1188
1189 /*
1190 * Update kring and ring at the end of txsync.
1191 */
1192 void
kr_txsync_finalize(struct kern_channel * ch,struct __kern_channel_ring * kring,struct proc * p)1193 kr_txsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1194 struct proc *p)
1195 {
1196 slot_idx_t ckr_khead, ckr_ktail;
1197 uint32_t slot_size;
1198 int32_t slot_diff;
1199
1200 VERIFY(sk_is_sync_protected());
1201 /* assert that this routine is only called for user facing rings */
1202 ASSERT(!KR_KERNEL_ONLY(kring));
1203
1204 /* read these once and use local copies */
1205 ckr_khead = kring->ckr_khead;
1206 ckr_ktail = kring->ckr_ktail;
1207
1208 /*
1209 * update userspace-facing channel statistics (# writable bytes/slots)
1210 *
1211 * Since the ring might be dynamically allocated, we can't rely on the
1212 * tail pointer to calculate free TX space (the tail might be sitting
1213 * at the edge of allocated ring space but be able to be pushed over
1214 * into unallocated ring space).
1215 *
1216 * Instead, calculate free TX space by looking at what slots are
1217 * available to the kernel for TX, and subtracting that from the total
1218 * number of possible slots. This is effectively what userspace can
1219 * write to.
1220 */
1221 slot_size = kring->ckr_pp->pp_buflet_size;
1222 slot_diff = kring->ckr_rhead - ckr_khead;
1223 if (slot_diff < 0) {
1224 slot_diff += kring->ckr_num_slots;
1225 }
1226 slot_diff = kring->ckr_lim - slot_diff;
1227 kring->ckr_ready_slots = slot_diff;
1228 kring->ckr_ready_bytes = slot_diff * slot_size;
1229
1230 /*
1231 * Invoke nexus-specific TX finalize callback, which may recycle any
1232 * transferred packets and/or externalize new ones. Some nexus don't
1233 * have any callback set. Configured in na_kr_create().
1234 */
1235 if (kring->ckr_finalize != NULL) {
1236 kring->ckr_finalize(ch, kring, ckr_khead, p);
1237 }
1238
1239 /* update ring tail/khead to what the kernel knows */
1240 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1241 kring->ckr_rtail = ckr_ktail;
1242 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1243
1244 SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
1245 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1246 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1247 kring->ckr_rhead, kring->ckr_rtail,
1248 kring->ckr_ring->ring_head,
1249 kring->ckr_ring->ring_tail);
1250 }
1251
1252 /*
1253 * Nexus-specific kr_rxsync_finalize() callback.
1254 */
1255 void
kr_rxfinalize(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t tail,struct proc * p)1256 kr_rxfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1257 const slot_idx_t tail, struct proc *p)
1258 {
1259 #pragma unused(ch)
1260 const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
1261 slot_idx_t slot_idx = kring->ckr_rtail;
1262 uint32_t byte_count = 0;
1263
1264 while (slot_idx != tail) {
1265 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1266 struct __kern_quantum *kqum = ksd->sd_qum;
1267
1268 /*
1269 * nexus provider should never leave an empty slot on rx ring.
1270 */
1271 VERIFY(kqum != NULL);
1272 kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1273 ASSERT(!(KR_USD(kring, slot_idx)->sd_flags & ~SD_FLAGS_USER));
1274
1275 byte_count += kqum->qum_len;
1276 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1277 }
1278
1279 kring->ckr_ready_bytes += byte_count;
1280
1281 /* just recalculate slot count using pointer arithmetic */
1282 int32_t slot_diff = tail - kring->ckr_rhead;
1283 if (slot_diff < 0) {
1284 slot_diff += kring->ckr_num_slots;
1285 }
1286 kring->ckr_ready_slots = slot_diff;
1287
1288 #if CONFIG_NEXUS_NETIF
1289 /*
1290 * If this is a channel opened directly to the netif nexus, provide
1291 * it feedbacks on the number of packets and bytes consumed. This
1292 * will drive the receive mitigation strategy.
1293 */
1294 if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
1295 slot_diff != 0 && byte_count != 0) {
1296 kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
1297 }
1298 #endif /* CONFIG_NEXUS_NETIF */
1299 }
1300
1301 /*
1302 * Nexus-specific kr_rxsync_finalize() callback - user packet pool variant.
1303 */
1304 void
kr_rxfinalize_upp(struct kern_channel * ch,struct __kern_channel_ring * kring,const slot_idx_t tail,struct proc * p)1305 kr_rxfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
1306 const slot_idx_t tail, struct proc *p)
1307 {
1308 const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
1309 slot_idx_t slot_idx = kring->ckr_rtail;
1310 struct kern_pbufpool *pp = kring->ckr_pp;
1311 uint32_t byte_count = 0;
1312
1313 PP_LOCK(pp);
1314 while (slot_idx != tail) {
1315 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1316 struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
1317 struct __kern_quantum *kqum = ksd->sd_qum;
1318
1319 /*
1320 * nexus provider should never leave an empty slot on rx ring.
1321 */
1322 VERIFY(kqum != NULL);
1323 /*
1324 * The channel is operating in packet allocator
1325 * mode, so add packet to the allocated list.
1326 */
1327 pp_insert_upp_locked(pp, kqum, ch->ch_pid);
1328
1329 KSD_DETACH_METADATA(ksd);
1330 /* To calculate ckr_ready_bytes by kr_rxsync_prologue */
1331 USD_SET_LENGTH(usd, (uint16_t)kqum->qum_len);
1332
1333 kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1334 ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);
1335
1336 byte_count += kqum->qum_len;
1337 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1338 }
1339 PP_UNLOCK(pp);
1340
1341 kring->ckr_ready_bytes += byte_count;
1342
1343 /* just recalculate slot count using pointer arithmetic */
1344 int32_t slot_diff = tail - kring->ckr_rhead;
1345 if (slot_diff < 0) {
1346 slot_diff += kring->ckr_num_slots;
1347 }
1348 kring->ckr_ready_slots = slot_diff;
1349
1350 #if CONFIG_NEXUS_NETIF
1351 /*
1352 * If this is a channel opened directly to the netif nexus, provide
1353 * it feedbacks on the number of packets and bytes consumed. This
1354 * will drive the receive mitigation strategy.
1355 */
1356 if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
1357 slot_diff != 0 && byte_count != 0) {
1358 kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
1359 }
1360 #endif /* CONFIG_NEXUS_NETIF */
1361 }
1362
1363 /*
1364 * Update kring and ring at the end of rxsync
1365 */
1366 void
kr_rxsync_finalize(struct kern_channel * ch,struct __kern_channel_ring * kring,struct proc * p)1367 kr_rxsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1368 struct proc *p)
1369 {
1370 #pragma unused(ch, p)
1371 slot_idx_t ckr_khead, ckr_ktail;
1372
1373 VERIFY(sk_is_sync_protected());
1374 /* assert that this routine is only called for user facing rings */
1375 ASSERT(!KR_KERNEL_ONLY(kring));
1376 ASSERT(kring->ckr_usds != NULL);
1377
1378 /* read these once and use local copies */
1379 ckr_khead = kring->ckr_khead;
1380 ckr_ktail = kring->ckr_ktail;
1381
1382 /*
1383 * Invoke nexus-specific RX finalize callback; set in na_kr_create().
1384 */
1385 if (kring->ckr_finalize != NULL) {
1386 kring->ckr_finalize(ch, kring, ckr_ktail, p);
1387 }
1388
1389 /* update ring tail/khead to what the kernel knows */
1390 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1391 kring->ckr_rtail = ckr_ktail;
1392 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1393
1394 SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1395 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1396 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1397 kring->ckr_rhead, kring->ckr_rtail,
1398 kring->ckr_ring->ring_head,
1399 kring->ckr_ring->ring_tail);
1400 }
1401
/*
 * Update kring and ring at the end of an alloc sync: publish the
 * kernel's ktail/khead and the current allocator working-set size
 * into the user-shared ring.
 */
void
kr_alloc_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
{
/* p is only referenced inside SK_DF, which presumably compiles out in
 * release builds — hence the pragma (NOTE(review): confirm SK_DF). */
#pragma unused(p)
	slot_idx_t ckr_khead, ckr_ktail;

	VERIFY(sk_is_sync_protected());
	/* read these once and use local copies */
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;

	/* update ring tail/khead to what the kernel knows */
	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
	    kring->ckr_rtail = ckr_ktail;
	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
	/* also publish the allocator working-set size */
	*(uint32_t *)(uintptr_t)&kring->ckr_ring->ring_alloc_ws =
	    kring->ckr_alloc_ws;

	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u | ws %u",
	    sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    kring->ckr_ring->ring_head,
	    kring->ckr_ring->ring_tail, kring->ckr_alloc_ws);
}
1428
/*
 * Update kring and ring at the end of a free sync: publish the
 * kernel's ktail/khead into the user-shared ring.
 */
void
kr_free_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
{
/* p is only referenced inside SK_DF, which presumably compiles out in
 * release builds — hence the pragma (NOTE(review): confirm SK_DF). */
#pragma unused(p)
	slot_idx_t ckr_khead, ckr_ktail;

	VERIFY(sk_is_sync_protected());
	/* read these once and use local copies */
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;

	/* update ring tail/khead to what the kernel knows */
	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
	    kring->ckr_rtail = ckr_ktail;
	*(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;

	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    kring->ckr_ring->ring_head,
	    kring->ckr_ring->ring_tail);
}
1452
/*
 * Validate parameters on the ring/kring on entry for an event sync.
 * Returns ring->ring_head if ok, kring->ckr_num_slots on error,
 * in order to force a reinit.
 *
 * Event rings always operate in user packet pool mode: every slot the
 * user has consumed (ckr_rhead -> ring_head) must already have had its
 * packet detached; a still-attached user slot kills the process.
 */
slot_idx_t
kr_event_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
{
	struct __user_channel_ring *ring = kring->ckr_ring;
	slot_idx_t ckr_khead, ckr_ktail;
	slot_idx_t head, slot_idx;
	uint64_t err_reason = 0;

	ASSERT(kring->ckr_tx == NR_EV);
	VERIFY(sk_is_sync_protected());

	/* read these once and use local copies */
	ckr_khead = kring->ckr_khead;
	ckr_ktail = kring->ckr_ktail;
	head = ring->ring_head;         /* user-shared; read only once */

	SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    head, ring->ring_tail);
	/*
	 * Before storing the new values, we should check they do not
	 * move backwards. However, head is not an issue because the
	 * previous value is khead;
	 */
	/* may jump to "error" below via NM_FAIL_ON on a bad index */
	_KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);

	/*
	 * Iterating through the slots just read by user-space;
	 * ckr_rhead -> ring_head
	 */
	slot_idx = kring->ckr_rhead;
	while (slot_idx != head) {
		struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
		struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
		/*
		 * ensure that the user has detached packet from slot.
		 */
		VERIFY(!KSD_VALID_METADATA(ksd));
		if (__improbable(SD_VALID_METADATA(usd))) {
			SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not "
			    "detached md %u kh %u kt %u | rh %u rt %u |"
			    " h %u t %u", sk_proc_name_address(p),
			    sk_proc_pid(p), kring->ckr_name,
			    SK_KVA(kring), slot_idx, usd->sd_md_idx,
			    ckr_khead, ckr_ktail, kring->ckr_rhead,
			    kring->ckr_rtail, ring->ring_head,
			    ring->ring_tail);
			err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
			goto error;
		}
		slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
	}

	/* update the kernel view of ring */
	kring->ckr_rhead = head;
	return head;

error:
	SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
	    "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
	    sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
	    CKRF_BITS, ckr_khead, ckr_ktail,
	    kring->ckr_rhead, kring->ckr_rtail,
	    ring->ring_head, ring->ring_tail);

	skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_EVENT_SYNC);
	return kring->ckr_num_slots;
}
1523
1524 void
kr_event_sync_finalize(struct kern_channel * ch,struct __kern_channel_ring * kring,struct proc * p)1525 kr_event_sync_finalize(struct kern_channel *ch,
1526 struct __kern_channel_ring *kring, struct proc *p)
1527 {
1528 #pragma unused(ch)
1529 struct kern_pbufpool *pp = kring->ckr_pp;
1530 const uint32_t maxfrags = pp->pp_max_frags;
1531 slot_idx_t ckr_khead, ckr_ktail, ckr_rhead;
1532 struct __kern_slot_desc *ksd;
1533 struct __user_slot_desc *usd;
1534 struct __kern_quantum *kqum;
1535
1536 VERIFY(sk_is_sync_protected());
1537 /* assert that this routine is only called for user facing rings */
1538 ASSERT(!KR_KERNEL_ONLY(kring));
1539 ASSERT(kring->ckr_usds != NULL);
1540 ASSERT(kring->ckr_tx == NR_EV);
1541
1542 /* read these once and use local copies */
1543 ckr_khead = kring->ckr_khead;
1544 ckr_ktail = kring->ckr_ktail;
1545 ckr_rhead = kring->ckr_rhead;
1546
1547 slot_idx_t slot_idx = kring->ckr_rtail;
1548 PP_LOCK(pp);
1549 while (slot_idx != ckr_ktail) {
1550 ksd = KR_KSD(kring, slot_idx);
1551 usd = KR_USD(kring, slot_idx);
1552 kqum = ksd->sd_qum;
1553
1554 /*
1555 * Add packet to the allocated list of user packet pool.
1556 */
1557 pp_insert_upp_locked(pp, kqum, ch->ch_pid);
1558
1559 KSD_DETACH_METADATA(ksd);
1560 kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1561 ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);
1562 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1563 }
1564 PP_UNLOCK(pp);
1565
1566 /* just recalculate slot count using pointer arithmetic */
1567 int32_t slot_diff = ckr_ktail - ckr_rhead;
1568 if (slot_diff < 0) {
1569 slot_diff += kring->ckr_num_slots;
1570 }
1571 kring->ckr_ready_slots = slot_diff;
1572
1573 /* update ring tail/khead to what the kernel knows */
1574 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1575 kring->ckr_rtail = ckr_ktail;
1576 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1577
1578 SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1579 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1580 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1581 kring->ckr_rhead, kring->ckr_rtail,
1582 kring->ckr_ring->ring_head,
1583 kring->ckr_ring->ring_tail);
1584 }
1585 #undef NM_FAIL_ON
1586
1587 void
kr_txkring_reclaim_and_refill(struct __kern_channel_ring * kring,slot_idx_t index)1588 kr_txkring_reclaim_and_refill(struct __kern_channel_ring *kring,
1589 slot_idx_t index)
1590 {
1591 const slot_idx_t lim = kring->ckr_lim;
1592 slot_idx_t next_index = SLOT_NEXT(index, lim);
1593
1594 kring->ckr_khead = next_index;
1595 /* reclaim */
1596 kring->ckr_ktail = index;
1597 }
1598
1599 /*
1600 * *************************************************************************
1601 * Checks on packet header offsets in kr_internalize_metadata
1602 * *************************************************************************
1603 *
1604 * +----------+------------------------------+----------------------------+
1605 * | | NEXUS_META_SUBTYPE_RAW | NEXUS_META_SUBTYPE_PAYLOAD |
1606 * |----------+------------------------------+----------------------------+
1607 * | buflet | (bdoff + len) <= dlim | (bdoff + len) <= dlim |
1608 * |----------+------------------------------+----------------------------+
1609 * | headroom | hr == bdoff && hr < bdlim | hr == 0 && bdoff == 0 |
1610 * |----------+------------------------------+----------------------------+
 * | l2_len   | hr + l2_len < bdlim          | l2_len == 0                |
1612 * |----------+------------------------------+----------------------------+
1613 */
/*
 * Internalize (copy in and validate) the user-written metadata of a
 * quantum/packet into its kernel shadow, on the TX path.
 *
 * All user-supplied fields — buflet count, buffer indices, data
 * offsets/lengths, headroom, L2 length, checksum offload offsets —
 * are validated against the kernel's trusted copies (see the offset
 * check table above).  Any mismatch marks the packet QUM_F_DROPPED
 * with qum_len 0 and returns ERANGE; success marks it
 * QUM_F_INTERNALIZED | QUM_F_FINALIZED with qum_len set to the sum
 * of the buflet data lengths.  Returns 0 on success, ERANGE on any
 * validation failure.
 */
int
kr_internalize_metadata(struct kern_channel *ch,
    struct __kern_channel_ring *kring, const uint32_t maxfrags,
    struct __kern_quantum *kqum, struct proc *p)
{
/* kring/maxfrags/p are presumably only referenced by ASSERT/SK_ERR/SK_DF,
 * which may compile out in release builds (NOTE(review): confirm). */
#pragma unused(kring, maxfrags, p)
	struct __user_buflet *ubuf, *pubuf;     /* user buflet */
	struct __kern_buflet *kbuf, *pkbuf;     /* kernel buflet */
	struct __user_quantum *uqum;            /* user source */
	struct __user_packet *upkt;
	struct __kern_packet *kpkt;
	const nexus_meta_type_t md_type = METADATA_TYPE(kqum);
	const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum);
	uint32_t len = 0;
	uint16_t bcnt = 0, bmax, i, bdoff, bdlim;
	boolean_t dropped;
	int err = 0;

	/*
	 * Verify that the quantum/packet belongs to the same pp as
	 * the one used by the adapter, i.e. the packet must have
	 * been allocated from the same pp and attached to the kring.
	 */
	ASSERT(kqum->qum_pp == kring->ckr_pp);

	_CASSERT(sizeof(uqum->qum_com) == sizeof(kqum->qum_com));
	_CASSERT(sizeof(upkt->pkt_com) == sizeof(kpkt->pkt_com));
	uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
	ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
	upkt = SK_PTR_ADDR_UPKT(uqum);
	kpkt = SK_PTR_ADDR_KPKT(kqum);

	DTRACE_SKYWALK3(internalize, struct __kern_channel_ring *, kring,
	    struct __kern_packet *, kpkt, struct __user_packet *, upkt);
	SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx uqum 0x%llx -> kqum 0x%llx",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring),
	    SK_KVA(uqum), SK_KVA(kqum));

	/* check if it's dropped before we internalize it */
	dropped = ((uqum->qum_qflags & QUM_F_DROPPED) != 0);

	/*
	 * Internalize common quantum metadata.
	 *
	 * For packet metadata, we trust the kernel copy for the buflet
	 * count and limit; any mismatch on the user copy will cause
	 * us to drop this packet.
	 */
	_QUM_INTERNALIZE(uqum, kqum);

	/* if marked as dropped, don't bother going further */
	if (__improbable(dropped)) {
		SK_ERR("%s(%d) kring 0x%llx dropped",
		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring));
		err = ERANGE;
		goto done;
	}

	switch (md_type) {
	case NEXUS_META_TYPE_PACKET:
		/*
		 * Internalize common packet metadata.
		 */
		_PKT_INTERNALIZE(upkt, kpkt);

		switch (md_subtype) {
		case NEXUS_META_SUBTYPE_PAYLOAD:
			/* sanitize link layer fields for payload mode */
			kpkt->pkt_link_flags = 0;
			break;
		default:
			break;
		}

		/* stamp the packet with the originating channel's id */
		if (__probable(ch != NULL)) {
			_UUID_COPY(kpkt->pkt_flowsrc_id,
			    ch->ch_info->cinfo_ch_id);
		}

		/*
		 * Buflet count comes from the user copy, but must agree
		 * with the kernel's trusted maximum on both sides.
		 */
		bcnt = upkt->pkt_bufs_cnt;
		bmax = kpkt->pkt_bufs_max;
		ASSERT(bmax == maxfrags);
		if (__improbable((bcnt == 0) || (bcnt > bmax) ||
		    (upkt->pkt_bufs_max != bmax))) {
			SK_ERR("%s(%d) kring 0x%llx bad bufcnt %d, %d, %d",
			    sk_proc_name_address(p), sk_proc_pid(p),
			    SK_KVA(kring), bcnt, bmax, upkt->pkt_bufs_max);
			err = ERANGE;
			goto done;
		}
		break;

	case NEXUS_META_TYPE_QUANTUM:
		/* a bare quantum always has exactly one buflet */
		ASSERT(maxfrags == 1);
		bcnt = bmax = 1;
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(bcnt != 0);
	ubuf = pubuf = NULL;
	kbuf = pkbuf = NULL;

	/*
	 * Validate and internalize buflets.
	 */
	for (i = 0; i < bcnt; i++) {
		_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
		_CASSERT(offsetof(struct __user_packet, pkt_qum) == 0);
		_CASSERT(offsetof(struct __kern_quantum, qum_com) == 0);
		PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
		ASSERT(kbuf != NULL);
		/* locate the matching user-visible buflet */
		if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
			ubuf = __DECONST(struct __user_buflet *,
			    ((struct __kern_buflet_ext *)kbuf)->kbe_buf_user);
		} else {
			/* only the first buflet can be the embedded one */
			ASSERT(i == 0);
			ubuf = __DECONST(struct __user_buflet *,
			    &uqum->qum_buf[0]);
		}
		ASSERT(ubuf != NULL);
		ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
		ASSERT(kbuf->buf_dlim == kqum->qum_pp->pp_buflet_size);
		ASSERT(kbuf->buf_addr != 0);
		/*
		 * For now, user-facing pool does not support shared
		 * buffer, since otherwise the ubuf and kbuf buffer
		 * indices would not match. Assert this is the case.
		 */
		ASSERT(kbuf->buf_addr == (mach_vm_address_t)kbuf->buf_objaddr);

		/* copy in user-controlled data length and offset */
		kbuf->buf_dlen = ubuf->buf_dlen;
		kbuf->buf_doff = ubuf->buf_doff;

		/*
		 * kernel and user metadata use the same object index
		 * also checks the sanity of buflet data offset and length
		 */
		if (__improbable(!BUF_IN_RANGE(kbuf) ||
		    ubuf->buf_idx != kbuf->buf_idx)) {
			/* neutralize the bad values before bailing */
			kbuf->buf_dlen = kbuf->buf_doff = 0;
			SK_ERR("%s(%d) kring 0x%llx bad bufidx 0x%x, 0x%x",
			    sk_proc_name_address(p), sk_proc_pid(p),
			    SK_KVA(kring), kbuf->buf_idx, ubuf->buf_idx);
			err = ERANGE;
			goto done;
		}

		/*
		 * save data offset from the first buflet; always taken on
		 * the first iteration (pkbuf starts NULL, bcnt >= 1), so
		 * bdoff is initialized before its use further below
		 */
		if (pkbuf == NULL) {
			bdoff = kbuf->buf_doff;
		}

		/* all good to go */
		len += kbuf->buf_dlen;
		pubuf = ubuf;
		pkbuf = kbuf;
	}

	/* the declared packet length must equal the sum of buflet lengths */
	_CASSERT(offsetof(struct __kern_packet, pkt_length) ==
	    offsetof(struct __kern_packet, pkt_qum.qum_len));
	if (__improbable(kpkt->pkt_length != len)) {
		SK_ERR("%s(%d) kring 0x%llx bad pktlen %d, %d",
		    sk_proc_name_address(p), sk_proc_pid(p),
		    SK_KVA(kring), kpkt->pkt_length, len);
		err = ERANGE;
		goto done;
	}

	/* packet-only header/offset checks (see table above) */
	if ((err == 0) && (md_type == NEXUS_META_TYPE_PACKET)) {
		bdlim = kqum->qum_pp->pp_buflet_size;
		switch (md_subtype) {
		case NEXUS_META_SUBTYPE_RAW:
			/*
			 * For a raw packet from user space we need to
			 * validate that headroom is sane and is in the
			 * first buflet.
			 */
			if (__improbable(kpkt->pkt_headroom != bdoff)) {
				SK_ERR("%s(%d) kring 0x%llx bad headroom %d, %d",
				    sk_proc_name_address(p), sk_proc_pid(p),
				    SK_KVA(kring), kpkt->pkt_headroom, bdoff);
				err = ERANGE;
				goto done;
			}
			/* headroom + L2 header must fit within one buflet */
			if (__improbable(kpkt->pkt_headroom +
			    kpkt->pkt_l2_len >= bdlim)) {
				SK_ERR("%s(%d) kring 0x%llx bad headroom l2len %d, %d",
				    sk_proc_name_address(p), sk_proc_pid(p),
				    SK_KVA(kring), kpkt->pkt_l2_len, bdlim);
				err = ERANGE;
				goto done;
			}
			break;
		case NEXUS_META_SUBTYPE_PAYLOAD:
			/*
			 * For a payload packet from user space we need
			 * to validate that payload starts from 0 and L2
			 * length is 0.
			 */
			if (__improbable((kpkt->pkt_headroom != 0) ||
			    (kpkt->pkt_l2_len != 0))) {
				SK_ERR("%s(%d) kring 0x%llx bad headroom "
				    "payload subtype %d headroom %d l2len %d",
				    sk_proc_name_address(p), sk_proc_pid(p),
				    SK_KVA(kring), SK_PTR_SUBTYPE(kpkt),
				    kpkt->pkt_headroom, kpkt->pkt_l2_len);
				err = ERANGE;
				goto done;
			}
			break;
		default:
			VERIFY(0);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		/* validate checksum offload properties */
		if (__probable(PACKET_HAS_PARTIAL_CHECKSUM(kpkt))) {
			uint16_t start = kpkt->pkt_csum_tx_start_off;
			uint16_t stuff = kpkt->pkt_csum_tx_stuff_off;
			/* both offsets (and the 16-bit stuff field) must
			 * lie within the packet */
			if (__improbable(start > stuff ||
			    start > kpkt->pkt_length ||
			    (stuff + sizeof(uint16_t)) > kpkt->pkt_length)) {
				SK_ERR("%s(%d) flags 0x%x start %u stuff %u "
				    "len %u", sk_proc_name_address(p),
				    sk_proc_pid(p), kpkt->pkt_csum_flags,
				    start, stuff, kpkt->pkt_length);
				err = ERANGE;
				goto done;
			}
		} else {
			kpkt->pkt_csum_tx_start_off = 0;
			kpkt->pkt_csum_tx_stuff_off = 0;
		}
		/* commit the validated buflet count to the kernel copy */
		*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = bcnt;
	}

done:
	if (__probable(err == 0)) {
		kqum->qum_len = len;
		kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_FINALIZED);
	} else {
		/* poison the packet so it is dropped downstream */
		kqum->qum_len = 0;
		kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_DROPPED);
	}
	return err;
}
1866
/*
 * Externalize (copy out) a kernel quantum/packet's metadata into its
 * user-visible shadow objects (__user_quantum / __user_packet and the
 * per-buflet __user_buflet chain), so that the slot's contents become
 * observable by the channel's user-space mapping.  On return the user
 * copy is marked QUM_F_FINALIZED and the kernel copy is no longer
 * QUM_F_INTERNALIZED.
 *
 * Always-inlined worker; kr_externalize_metadata() below is the
 * out-of-line entry point for external callers.
 */
__attribute__((always_inline))
static inline void
kr_externalize_metadata_internal(struct __kern_channel_ring *kring,
    const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
{
#pragma unused(kring, maxfrags, p)
	struct __kern_buflet *kbuf, *pkbuf;     /* kernel buflet */
	struct __user_buflet *ubuf, *pubuf;     /* user buflet */
	struct __user_quantum *uqum;            /* user destination */
	struct __user_packet *upkt;
	struct __kern_packet *kpkt;
	const nexus_meta_type_t md_type = METADATA_TYPE(kqum);
	const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum);
	uint32_t len = 0;                       /* total data length across buflets */
	uint16_t bcnt = 0, bmax, i;

	/*
	 * Verify that the quantum/packet belongs to the same pp as
	 * the one used by the adapter, i.e. the packet must have
	 * been allocated from the same pp and attached to the kring.
	 */
	ASSERT(kqum->qum_pp == kring->ckr_pp);
	ASSERT(kqum->qum_qflags & (QUM_F_FINALIZED | QUM_F_INTERNALIZED));

	/* kernel and user common metadata regions must be mirror images */
	_CASSERT(sizeof(kpkt->pkt_com) == sizeof(upkt->pkt_com));
	_CASSERT(sizeof(kqum->qum_com) == sizeof(uqum->qum_com));
	uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
	/* kernel-only quanta have no user shadow and must never get here */
	ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
	upkt = SK_PTR_ADDR_UPKT(uqum);
	kpkt = SK_PTR_ADDR_KPKT(kqum);

	DTRACE_SKYWALK3(externalize, struct __kern_channel_ring *, kring,
	    struct __kern_packet *, kpkt, struct __user_packet *, upkt);
	SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx kqum 0x%llx -> uqum 0x%llx",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring),
	    SK_KVA(kqum), SK_KVA(uqum));

	/*
	 * Externalize common quantum metadata.
	 */
	_QUM_EXTERNALIZE(kqum, uqum);

	switch (md_type) {
	case NEXUS_META_TYPE_PACKET: {
		bcnt = kpkt->pkt_bufs_cnt;
		bmax = kpkt->pkt_bufs_max;
		ASSERT(bmax == maxfrags);
		ASSERT(bcnt <= bmax);
		/*
		 * Externalize common packet metadata.
		 */
		_PKT_EXTERNALIZE(kpkt, upkt);

		/* sanitize buflet count and limit (deconst) */
		_CASSERT(sizeof(upkt->pkt_bufs_max) == sizeof(uint16_t));
		_CASSERT(sizeof(upkt->pkt_bufs_cnt) == sizeof(uint16_t));
		/*
		 * These user fields are const-qualified; write through an
		 * integer cast so user space always sees the kernel's
		 * authoritative values rather than whatever it left there.
		 */
		*(uint16_t *)(uintptr_t)&upkt->pkt_bufs_max = bmax;
		*(uint16_t *)(uintptr_t)&upkt->pkt_bufs_cnt = bcnt;

		switch (md_subtype) {
		case NEXUS_META_SUBTYPE_PAYLOAD:
			/* sanitize link layer fields for payload mode */
			upkt->pkt_headroom = 0;
			upkt->pkt_link_flags = 0;
			break;
		default:
			break;
		}
		break;
	}

	case NEXUS_META_TYPE_QUANTUM:
		/* a plain quantum always carries exactly one buflet */
		ASSERT(maxfrags == 1);
		bcnt = bmax = 1;
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(bcnt != 0);
	/*
	 * special handling to externalize empty packet buflet:
	 * if the embedded first buflet has no buffer attached
	 * (buf_addr == 0), (re)initialize its user shadow buflet.
	 */
	kbuf = &kpkt->pkt_qum.qum_buf[0];
	if (kbuf->buf_addr == 0) {
		ubuf = __DECONST(struct __user_buflet *,
		    &kpkt->pkt_qum.qum_user->qum_buf[0]);
		UBUF_INIT(kbuf, ubuf);
	}

	kbuf = pkbuf = NULL;
	ubuf = pubuf = NULL;
	/*
	 * Externalize buflets: walk the kernel buflet chain and copy each
	 * one out to its matching user buflet, accumulating data length.
	 */
	for (i = 0; i < bcnt; i++) {
		_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
		PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
		ASSERT(kbuf != NULL);

		if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
			/* external buflets carry their own user shadow */
			ubuf = __DECONST(struct __user_buflet *,
			    ((struct __kern_buflet_ext *)kbuf)->kbe_buf_user);
		} else {
			/* only the first buflet may be the embedded one */
			ASSERT(i == 0);
			ubuf = __DECONST(struct __user_buflet *,
			    &kpkt->pkt_qum.qum_user->qum_buf[0]);
		}

		ASSERT(ubuf != NULL);
		/* chain must make progress: no repeated kernel/user buflet */
		ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
		ASSERT(BUF_IN_RANGE(kbuf));
		KBUF_EXTERNALIZE(kbuf, ubuf, kqum->qum_pp);

		/* all good to go */
		len += kbuf->buf_dlen;
		pkbuf = kbuf;
		pubuf = ubuf;
	}

	/* publish total length and mark the user copy as finalized */
	uqum->qum_len = len;
	uqum->qum_qflags |= QUM_F_FINALIZED;

	/*
	 * XXX: [email protected] -- do this during reclaim instead?
	 */
	kqum->qum_qflags &= ~QUM_F_INTERNALIZED;
}
1998
1999
/*
 * Public entry point: externalize (copy out) a kernel quantum/packet's
 * metadata to its user-visible shadow.  Thin wrapper that provides an
 * out-of-line instantiation of the always-inlined worker above for
 * callers outside this file.
 */
void
kr_externalize_metadata(struct __kern_channel_ring *kring,
    const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
{
	kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
}
2006