1 /*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #include <kern/sched_prim.h>
31 #include <kern/uipc_domain.h>
32 #include <sys/sdt.h>
33
34 static void kr_update_user_stats(struct __kern_channel_ring *,
35 uint32_t, uint32_t);
36 static void kr_externalize_metadata_internal(struct __kern_channel_ring *,
37 const uint32_t, struct __kern_quantum *, struct proc *);
38
39 #define KR_TRANSFER_DECAY 2 /* ilog2 of EWMA decay rate (4) */
40 static uint32_t kr_transfer_decay = 0;
41
42 #define KR_ACCUMULATE_INTERVAL 2 /* 2 seconds */
43 static uint32_t kr_accumulate_interval = KR_ACCUMULATE_INTERVAL;
44
45 #if (DEVELOPMENT || DEBUG)
46 #define KR_STAT_ENABLE 1
47 #else /* !(DEVELOPMENT || DEBUG) */
48 #define KR_STAT_ENABLE 0
49 #endif /* !(DEVELOPMENT || DEBUG) */
50 /* Enable/Disable ring stats collection */
51 uint32_t kr_stat_enable = KR_STAT_ENABLE;
52
53 #if (DEVELOPMENT || DEBUG)
54 SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_transfer_decay,
55 CTLFLAG_RW | CTLFLAG_LOCKED, &kr_transfer_decay,
56 0, "ilog2 of EWMA decay rate of ring transfers");
57
58 SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_accumulate_interval,
59 CTLFLAG_RW | CTLFLAG_LOCKED, &kr_accumulate_interval,
60 KR_ACCUMULATE_INTERVAL, "accumulation interval for ring stats");
61
62 uint32_t kr_disable_panic_on_sync_err = 0;
63 SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_panic_on_sync_err,
64 CTLFLAG_RW | CTLFLAG_LOCKED, &kr_disable_panic_on_sync_err,
65 0, "disable panic on sync error");
66 #endif /* (DEVELOPMENT || DEBUG) */
67
68 SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_enable,
69 CTLFLAG_RW | CTLFLAG_LOCKED, &kr_stat_enable,
70 0, "enable/disable stats collection for ring");
71
72 #define KR_EWMA(old, new, decay) do { \
73 u_int64_t _avg; \
74 if (__probable((_avg = (old)) > 0)) \
75 _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
76 else \
77 _avg = (new); \
78 (old) = _avg; \
79 } while (0)
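/*
 * Worked example of the EWMA above (illustrative only): with a decay of
 * KR_TRANSFER_DECAY (2), the old average keeps a 3/4 weight and the new
 * sample a 1/4 weight.  For old == 400 and new == 800:
 *
 *	_avg = ((400 << 2) - 400 + 800) >> 2
 *	     = (1600 - 400 + 800) / 4
 *	     = 500   (== 0.75 * 400 + 0.25 * 800)
 */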
80
81 #define _BUF_DLIM(_buf, _pp) (BUFLET_HAS_LARGE_BUF(_buf) ? \
82 PP_BUF_SIZE_LARGE(_pp) : PP_BUF_SIZE_DEF(_pp))
83
84 void
85 kr_init_to_mhints(struct __kern_channel_ring *kring, uint32_t nslots)
86 {
87 uint32_t tail;
88
89 tail = nslots - 1;
90
91 kring->ckr_transfer_decay = KR_TRANSFER_DECAY;
92 kring->ckr_num_slots = nslots;
93 *(slot_idx_t *)(uintptr_t)&kring->ckr_lim = (nslots - 1);
94 kring->ckr_rhead = kring->ckr_khead = 0;
95 /* IMPORTANT: Always keep one slot empty */
96 kring->ckr_rtail = kring->ckr_ktail =
97 ((kring->ckr_tx == NR_TX) || (kring->ckr_tx == NR_F) ? tail : 0);
98 }
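/*
 * Example of the resulting geometry (illustrative only): for a TX or FREE
 * ring with nslots == 8, the above leaves khead = rhead = 0 and
 * ktail = rtail = 7, i.e. 7 usable slots; the one slot kept empty is what
 * distinguishes a full ring from an empty one.  RX/ALLOC/EVENT rings start
 * with ktail = 0 instead, since there is nothing to read yet.
 */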
99
100 /*
101 * Try to obtain exclusive right to issue the *sync() or state change
102 * operations on the ring. The right is obtained and must be later
103 * relinquished via kr_exit() if and only if kr_enter() returns 0.
104 *
105 * If the right cannot be obtained, the caller will typically skip the
106 * ring, possibly collecting errors along the way.
107 *
108 * If the calling context does not allow sleeping, the caller must pass
109 * FALSE in can_sleep; EBUSY may be returned if the right is held by
110 * another thread. Otherwise, the caller may block until the right is
111 * released by the previous holder.
112 */
113 int
114 kr_enter(struct __kern_channel_ring *kr, boolean_t can_sleep)
115 {
116 lck_spin_lock(&kr->ckr_slock);
117 if (kr->ckr_owner == current_thread()) {
118 ASSERT(kr->ckr_busy != 0);
119 kr->ckr_busy++;
120 goto done;
121 }
122 if (!can_sleep) {
123 if (kr->ckr_busy != 0) {
124 lck_spin_unlock(&kr->ckr_slock);
125 return EBUSY;
126 }
127 } else {
128 while (kr->ckr_busy != 0) {
129 kr->ckr_want++;
130 (void) assert_wait(&kr->ckr_busy, THREAD_UNINT);
131 lck_spin_unlock(&kr->ckr_slock);
132 (void) thread_block(THREAD_CONTINUE_NULL);
133 SK_DF(SK_VERB_LOCKS, "waited for kr \"%s\" "
134 "(%p) busy=%u", kr->ckr_name,
135 SK_KVA(kr), kr->ckr_busy);
136 lck_spin_lock(&kr->ckr_slock);
137 }
138 }
139 LCK_SPIN_ASSERT(&kr->ckr_slock, LCK_ASSERT_OWNED);
140 ASSERT(kr->ckr_busy == 0);
141 kr->ckr_busy++;
142 kr->ckr_owner = current_thread();
143 done:
144 lck_spin_unlock(&kr->ckr_slock);
145
146 SK_DF(SK_VERB_LOCKS, "kr \"%s\" (%p) right acquired",
147 kr->ckr_name, SK_KVA(kr));
148
149 return 0;
150 }
151
152 void
153 kr_exit(struct __kern_channel_ring *kr)
154 {
155 uint32_t want = 0;
156
157 lck_spin_lock(&kr->ckr_slock);
158 ASSERT(kr->ckr_busy != 0);
159 ASSERT(kr->ckr_owner == current_thread());
160 if (--kr->ckr_busy == 0) {
161 kr->ckr_owner = NULL;
162
163 /*
164 * we're done with the kring;
165 * notify anyone that has lost the race
166 */
167 if ((want = kr->ckr_want) != 0) {
168 kr->ckr_want = 0;
169 wakeup((void *)&kr->ckr_busy);
170 lck_spin_unlock(&kr->ckr_slock);
171 } else {
172 lck_spin_unlock(&kr->ckr_slock);
173 }
174 } else {
175 lck_spin_unlock(&kr->ckr_slock);
176 }
177
178 SK_DF(SK_VERB_LOCKS, "kr \"%s\" (%p) right released (%u waiters)",
179 kr->ckr_name, SK_KVA(kr), want);
180 }
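/*
 * Minimal usage sketch for the kr_enter()/kr_exit() pair above
 * (illustrative only, not part of this file):
 *
 *	if (kr_enter(kr, FALSE) == 0) {
 *		... exclusive right held: issue *sync() or change state ...
 *		kr_exit(kr);
 *	} else {
 *		... EBUSY: another thread holds the right; skip this ring ...
 *	}
 *
 * With can_sleep == TRUE the call instead blocks until the current holder
 * releases the right, so the EBUSY path disappears.
 */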
181
182
183 void
184 kr_start(struct __kern_channel_ring *kr)
185 {
186 lck_spin_lock(&kr->ckr_slock);
187 ASSERT(kr->ckr_busy != 0);
188 ASSERT(kr->ckr_state == KR_STOPPED || kr->ckr_state == KR_LOCKED);
189 /* now clear the state */
190 kr->ckr_state = KR_READY;
191 lck_spin_unlock(&kr->ckr_slock);
192
193 kr_exit(kr);
194
195 SK_DF(SK_VERB_LOCKS, "kr \"%s\" (%p) is started",
196 kr->ckr_name, SK_KVA(kr));
197 }
198
199 /*
200 * Put the kring in the 'stopped' state: either KR_STOPPED or KR_LOCKED.
201 * Also marks the ring as busy; a later call to kr_start() is required
202 * to clear the state and release the busy reference.
203 */
204 void
205 kr_stop(struct __kern_channel_ring *kr, uint32_t state)
206 {
207 uint32_t s;
208
209 ASSERT(state == KR_STOPPED || state == KR_LOCKED);
210
211 s = kr_enter(kr, TRUE);
212 ASSERT(s == 0);
213
214 lck_spin_lock(&kr->ckr_slock);
215 ASSERT(kr->ckr_busy != 0);
216 /* now set the state */
217 kr->ckr_state = state;
218 lck_spin_unlock(&kr->ckr_slock);
219
220 SK_DF(SK_VERB_LOCKS,
221 "kr \"%s\" (0x%p) krflags 0x%x is now stopped s=%u",
222 kr->ckr_name, SK_KVA(kr), kr->ckr_flags, state);
223 }
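/*
 * Typical quiesce/resume pattern built on kr_stop()/kr_start()
 * (illustrative sketch only):
 *
 *	kr_stop(kr, KR_STOPPED);
 *	... reconfigure or drain the ring while it is quiesced ...
 *	kr_start(kr);
 *
 * kr_stop() grabs the exclusive right via kr_enter() and leaves the ring
 * busy; kr_start() clears the state and performs the matching kr_exit().
 */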
224
225 static void
226 kr_update_user_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
227 uint32_t byte_count)
228 {
229 uint64_t now;
230 uint32_t transfer_decay = (kr_transfer_decay != 0) ?
231 kr_transfer_decay : kring->ckr_transfer_decay;
232 channel_ring_user_stats_t stats = &kring->ckr_usr_stats;
233
234 now = net_uptime();
235 kring->ckr_sync_time = now;
236
237 if (kr_stat_enable == 0) {
238 return;
239 }
240
241 stats->crsu_number_of_syncs++;
242 stats->crsu_total_bytes_transferred += byte_count;
243 stats->crsu_total_slots_transferred += slot_count;
244
245 if (slot_count > stats->crsu_max_slots_transferred) {
246 stats->crsu_max_slots_transferred = slot_count;
247 }
248
249 if (stats->crsu_min_slots_transferred == 0 ||
250 slot_count < stats->crsu_min_slots_transferred) {
251 stats->crsu_min_slots_transferred = slot_count;
252 }
253
254 if (__probable(kring->ckr_user_accumulate_start != 0)) {
255 if ((now - kring->ckr_user_accumulate_start) >=
256 kr_accumulate_interval) {
257 uint64_t bps;
258 uint64_t sps;
259 uint64_t sps_ma;
260
261 /* bytes per sync */
262 bps = kring->ckr_user_accumulated_bytes /
263 kring->ckr_user_accumulated_syncs;
264 KR_EWMA(stats->crsu_bytes_per_sync_ma,
265 bps, transfer_decay);
266 stats->crsu_bytes_per_sync = bps;
267
268 /* slots per sync */
269 sps = kring->ckr_user_accumulated_slots /
270 kring->ckr_user_accumulated_syncs;
271 sps_ma = stats->crsu_slots_per_sync_ma;
272 KR_EWMA(sps_ma, sps, transfer_decay);
273 stats->crsu_slots_per_sync_ma = (uint32_t)sps_ma;
274 stats->crsu_slots_per_sync = (uint32_t)sps;
275
276 /* start over */
277 kring->ckr_user_accumulate_start = now;
278 kring->ckr_user_accumulated_bytes = 0;
279 kring->ckr_user_accumulated_slots = 0;
280 kring->ckr_user_accumulated_syncs = 0;
281
282 stats->crsu_min_slots_transferred = 0;
283 stats->crsu_max_slots_transferred = 0;
284 }
285 } else {
286 kring->ckr_user_accumulate_start = now;
287 }
288
289 kring->ckr_user_accumulated_bytes += byte_count;
290 kring->ckr_user_accumulated_slots += slot_count;
291 kring->ckr_user_accumulated_syncs++;
292 }
293
294 /* caller is responsible for ensuring thread safety */
295 void
296 kr_update_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
297 uint32_t byte_count)
298 {
299 uint64_t now;
300 uint64_t diff_secs;
301 channel_ring_stats_t stats = &kring->ckr_stats;
302 uint32_t transfer_decay = (kr_transfer_decay != 0) ?
303 kr_transfer_decay : kring->ckr_transfer_decay;
304
305 if (kr_stat_enable == 0) {
306 return;
307 }
308
309 if (__improbable(slot_count == 0)) {
310 return;
311 }
312
313 stats->crs_number_of_transfers++;
314 stats->crs_total_bytes_transferred += byte_count;
315 stats->crs_total_slots_transferred += slot_count;
316 if (slot_count > stats->crs_max_slots_transferred) {
317 stats->crs_max_slots_transferred = slot_count;
318 }
319 if (stats->crs_min_slots_transferred == 0 ||
320 slot_count < stats->crs_min_slots_transferred) {
321 stats->crs_min_slots_transferred = slot_count;
322 }
323
324 now = net_uptime();
325 stats->crs_last_update_net_uptime = now;
326 if (__probable(kring->ckr_accumulate_start != 0)) {
327 diff_secs = now - kring->ckr_accumulate_start;
328 if (diff_secs >= kr_accumulate_interval) {
329 uint64_t bps;
330 uint64_t sps;
331 uint64_t sps_ma;
332
333 /* bytes per second */
334 bps = kring->ckr_accumulated_bytes / diff_secs;
335 KR_EWMA(stats->crs_bytes_per_second_ma,
336 bps, transfer_decay);
337 stats->crs_bytes_per_second = bps;
338
339 /* slots per second */
340 sps = kring->ckr_accumulated_slots / diff_secs;
341 sps_ma = stats->crs_slots_per_second_ma;
342 KR_EWMA(sps_ma, sps, transfer_decay);
343 stats->crs_slots_per_second_ma = (uint32_t)sps_ma;
344 stats->crs_slots_per_second = (uint32_t)sps;
345
346 /* start over */
347 kring->ckr_accumulate_start = now;
348 kring->ckr_accumulated_bytes = 0;
349 kring->ckr_accumulated_slots = 0;
350
351 stats->crs_min_slots_transferred = 0;
352 stats->crs_max_slots_transferred = 0;
353 }
354 } else {
355 kring->ckr_accumulate_start = now;
356 }
357 kring->ckr_accumulated_bytes += byte_count;
358 kring->ckr_accumulated_slots += slot_count;
359 }
360
361 /* True if no space in the tx ring; only valid after kr_txsync_prologue */
362 boolean_t
363 kr_txempty(struct __kern_channel_ring *kring)
364 {
365 return kring->ckr_rhead == kring->ckr_ktail;
366 }
367
368 #if SK_LOG
369 /*
370 * Error logging routine called when txsync/rxsync detects an error.
371 * Expected to be called before killing the process with skywalk_kill_process()
372 *
373 * This routine is only called by the upper half of the kernel.
374 * It only reads khead (which is changed only by the upper half, too)
375 * and ktail (which may be changed by the lower half, but only on
376 * a tx ring and only to increase it, so any error will be recovered
377 * on the next call). For these reasons, we don't strictly need to call
378 * it under lock.
379 */
380 void
381 kr_log_bad_ring(struct __kern_channel_ring *kring)
382 {
383 struct __user_channel_ring *ring = kring->ckr_ring;
384 const slot_idx_t lim = kring->ckr_lim;
385 slot_idx_t i;
386 int errors = 0;
387
388 SK_ERR("kr \"%s\" (0x%p) krflags 0x%x", kring->ckr_name, SK_KVA(kring),
389 kring->ckr_flags);
390
391 if (ring->ring_head > lim) {
392 errors++;
393 }
394 if (ring->ring_tail > lim) {
395 errors++;
396 }
397 for (i = 0; i <= lim; i++) {
398 struct __kern_slot_desc *ksd = KR_KSD(kring, i);
399 struct __kern_quantum *kqum = ksd->sd_qum;
400 obj_idx_t idx;
401 uint32_t len;
402
403 if (!KSD_VALID_METADATA(ksd)) {
404 continue;
405 }
406
407 idx = METADATA_IDX(kqum);
408 len = kqum->qum_len;
409 if (len > kring->ckr_max_pkt_len) {
410 SK_RDERR(5, "bad len at slot %u idx %u len %u",
411 i, idx, len);
412 }
413 }
414
415 if (errors != 0) {
416 SK_ERR("total %d errors", errors);
417 SK_ERR("kr \"%s\" (0x%p) krflags 0x%x crash, "
418 "head %u/%u -> %u tail %u/%u -> %u", kring->ckr_name,
419 SK_KVA(kring), kring->ckr_flags, ring->ring_head,
420 kring->ckr_rhead, kring->ckr_khead, ring->ring_tail,
421 kring->ckr_rtail, kring->ckr_ktail);
422 }
423 }
424 #endif /* SK_LOG */
425
426 uint32_t
427 kr_reclaim(struct __kern_channel_ring *kr)
428 {
429 int r = 0;
430
431 VERIFY(sk_is_sync_protected());
432
433 /*
434 * This is a no-op for TX ring, since the TX reclaim logic is only
435 * known to the nexus itself. There, the nexus's TX sync code would
436 * figure out the number of slots that has been "transmitted", and
437 * advance the slot pointer accordingly. This routine would then be
438 * called as a way to advise the system of such condition.
439 *
440 * For RX ring, this will reclaim user-released slots, and it is
441 * to be called by the provider's RX sync routine prior to its
442 * processing new slots (into the RX ring).
443 *
444 * It is therefore advised that this routine be called at the start
445 * of the RX sync callback, as well as at the end of the TX sync
446 * callback; the latter is useful in case we decide to implement
447 * more logic in the future.
448 */
449 if ((kr->ckr_tx == NR_RX) || (kr->ckr_tx == NR_EV)) {
450 /* # of reclaimed slots */
451 r = kr->ckr_rhead - kr->ckr_khead;
452 if (r < 0) {
453 r += kr->ckr_num_slots;
454 }
455
456 kr->ckr_khead = kr->ckr_rhead;
457 /* ensure global visibility */
458 os_atomic_thread_fence(seq_cst);
459 }
460
461 return (slot_idx_t)r;
462 }
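/*
 * Sketch of how a nexus provider would use kr_reclaim() per the comment
 * above (illustrative only; the callback name and signature here are
 * hypothetical):
 *
 *	static int
 *	my_rxsync(struct __kern_channel_ring *kring, struct proc *p,
 *	    uint32_t flags)
 *	{
 *		... first recover user-released slots ...
 *		(void) kr_reclaim(kring);
 *		... then fill new RX slots and advance ckr_ktail ...
 *		return 0;
 *	}
 */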
463
464 /*
465 * Nexus-specific kr_txsync_prologue() callback.
466 */
467 int
468 kr_txprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
469 const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
470 struct proc *p)
471 {
472 struct kern_pbufpool *pp = kring->ckr_pp;
473 const uint32_t maxfrags = pp->pp_max_frags;
474 slot_idx_t slot_idx = kring->ckr_rhead;
475
476 ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
477
478 while (slot_idx != head) {
479 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
480 struct __kern_quantum *kqum = ksd->sd_qum;
481 int err;
482
483 if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
484 METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
485 SK_ERR("qum index mismatch");
486 *err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
487 return -1;
488 }
489
490 /* Internalize */
491 err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
492 if (__improbable(err != 0)) {
493 SK_ERR("%s(%d) kr \"%s\" (%p) slot %u dropped "
494 "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
495 sk_proc_name(p), sk_proc_pid(p),
496 kring->ckr_name, SK_KVA(kring), slot_idx, err,
497 kring->ckr_khead, kring->ckr_ktail,
498 kring->ckr_rhead, kring->ckr_rtail,
499 kring->ckr_ring->ring_head,
500 kring->ckr_ring->ring_tail);
501 *err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
502 return -1;
503 }
504
505 *byte_count += kqum->qum_len;
506 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
507 }
508
509 return 0;
510 }
511
512 /*
513 * Nexus-specific kr_txsync_prologue() callback - user packet pool variant.
514 */
515 int
516 kr_txprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
517 const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
518 struct proc *p)
519 {
520 struct kern_pbufpool *pp = kring->ckr_pp;
521 const uint32_t maxfrags = pp->pp_max_frags;
522 slot_idx_t slot_idx = kring->ckr_rhead;
523 struct __kern_quantum *kqum = NULL;
524 bool free_pkt = false;
525 int err = 0;
526
527 ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
528
529 PP_LOCK(pp);
530 while (slot_idx != head) {
531 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
532 struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
533
534 /*
535 * The channel is operating in user packet pool mode;
536 * check if the packet is in the allocated list.
537 */
538 kqum = pp_remove_upp_locked(pp, usd->sd_md_idx, &err);
539 if (__improbable(err != 0)) {
540 if (kqum != NULL) {
541 SK_ERR("%s(%d) kr \"%s\" (%p) slot %u "
542 "kqum %p, bad buflet chain",
543 sk_proc_name(p), sk_proc_pid(p),
544 kring->ckr_name, SK_KVA(kring), slot_idx,
545 SK_KVA(kqum));
546 *err_reason =
547 SKYWALK_KILL_REASON_BAD_BUFLET_CHAIN;
548 goto done;
549 }
550
551 SK_ERR("%s(%d) kr \"%s\" (%p) slot %u "
552 " unallocated packet %u kh %u kt %u | "
553 "rh %u rt %u | h %u t %u",
554 sk_proc_name(p), sk_proc_pid(p),
555 kring->ckr_name, SK_KVA(kring), slot_idx,
556 usd->sd_md_idx, kring->ckr_khead, kring->ckr_ktail,
557 kring->ckr_rhead, kring->ckr_rtail,
558 kring->ckr_ring->ring_head,
559 kring->ckr_ring->ring_tail);
560 *err_reason = SKYWALK_KILL_REASON_UNALLOCATED_PKT;
561 goto done;
562 }
563
564 if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
565 METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
566 SK_ERR("qum index mismatch");
567 *err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
568 err = ERANGE;
569 free_pkt = true;
570 goto done;
571 }
572
573 /* Internalize */
574 err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
575 if (__improbable(err != 0)) {
576 SK_ERR("%s(%d) kr \"%s\" (%p) slot %u dropped "
577 "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
578 sk_proc_name(p), sk_proc_pid(p),
579 kring->ckr_name, SK_KVA(kring), slot_idx, err,
580 kring->ckr_khead, kring->ckr_ktail,
581 kring->ckr_rhead, kring->ckr_rtail,
582 kring->ckr_ring->ring_head,
583 kring->ckr_ring->ring_tail);
584 *err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
585 free_pkt = true;
586 goto done;
587 }
588
589 /*
590 * Attach packet to slot, detach mapping from alloc ring slot.
591 */
592 kqum->qum_ksd = NULL;
593 USD_RESET(usd);
594 KR_SLOT_ATTACH_METADATA(kring, ksd, kqum);
595
596 *byte_count += kqum->qum_len;
597 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
598 }
599
600 done:
601 PP_UNLOCK(pp);
602 if (__improbable(err != 0) && free_pkt) {
603 ASSERT(kqum != NULL);
604 kqum->qum_ksd = NULL;
605 pp_free_packet(pp, (uint64_t)kqum);
606 }
607 return err;
608 }
609
610 #define NM_FAIL_ON(t, reason) if (__improbable(t)) { SK_ERR("fail " #t); \
611 err_reason = reason; goto error; }
612 /*
613 * Validate parameters in the TX/FREE ring/kring.
614 *
615 * ckr_rhead, ckr_rtail=ktail are stored from previous round.
616 * khead is the next packet to send to the ring.
617 *
618 * We want
619 * khead <= *ckr_rhead <= head <= tail = *ckr_rtail <= ktail
620 *
621 * ckr_khead, ckr_rhead, ckr_rtail and ckr_ktail are reliable
622 */
623 #define _KR_TXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh, _krt) do {\
624 slot_idx_t _n = (_kring)->ckr_num_slots; \
625 /* kernel sanity checks */ \
626 NM_FAIL_ON((_kh) >= _n || kring->ckr_rhead >= _n || (_krt) >= _n || \
627 (_kt) >= _n, SKYWALK_KILL_REASON_BASIC_SANITY); \
628 /* user basic sanity checks */ \
629 NM_FAIL_ON((_rh) >= _n, SKYWALK_KILL_REASON_BASIC_SANITY); \
630 /* \
631 * user sanity checks. We only use 'cur', \
632 * A, B, ... are possible positions for cur: \
633 * \
634 * 0 A cur B tail C n-1 \
635 * 0 D tail E cur F n-1 \
636 * \
637 * B, F, D are valid. A, C, E are wrong \
638 */ \
639 if ((_krt) >= kring->ckr_rhead) { \
640 /* want ckr_rhead <= head <= ckr_rtail */ \
641 NM_FAIL_ON((_rh) < kring->ckr_rhead || (_rh) > (_krt), \
642 SKYWALK_KILL_REASON_HEAD_OOB); \
643 } else { /* here ckr_rtail < ckr_rhead */ \
644 /* we need head outside ckr_rtail .. ckr_rhead */ \
645 NM_FAIL_ON((_rh) > (_krt) && (_rh) < kring->ckr_rhead, \
646 SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED); \
647 } \
648 NM_FAIL_ON(ring->ring_tail != (_krt), \
649 SKYWALK_KILL_REASON_TAIL_MISMATCH); \
650 } while (0)
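/*
 * Wrap-around example for the head check above (illustrative only): with
 * ckr_num_slots == 8, ckr_rhead == 6 and ckr_rtail == 2 the ring has
 * wrapped, so a new head is accepted only outside (ckr_rtail .. ckr_rhead),
 * i.e. head in {6, 7, 0, 1, 2}; any head in {3, 4, 5} trips
 * SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED.
 */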
651
652 /*
653 * Validate parameters in the ring/kring on entry for *_txsync().
654 * Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
655 * in case of error, in order to force a reinit.
656 */
657 slot_idx_t
658 kr_txsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
659 struct proc *p)
660 {
661 struct __user_channel_ring *ring = kring->ckr_ring;
662 slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
663 slot_idx_t head;
664 uint32_t byte_count = 0;
665 uint64_t err_reason = 0;
666 int slot_count;
667
668 VERIFY(sk_is_sync_protected());
669 /* assert that this routine is only called for user facing rings */
670 ASSERT(!KR_KERNEL_ONLY(kring));
671 ASSERT(kring->ckr_usds != NULL);
672
673 /* read these once and use local copies */
674 head = ring->ring_head;
675 ckr_khead = kring->ckr_khead;
676 ckr_ktail = kring->ckr_ktail;
677 os_atomic_thread_fence(seq_cst);
678 ckr_rtail = kring->ckr_rtail;
679
680 SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
681 "rh %u rt %u | h %u t %u", sk_proc_name(p),
682 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
683 kring->ckr_rhead, ckr_rtail,
684 ring->ring_head, ring->ring_tail);
685
686 _KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);
687
688 /* # of new tx slots */
689 slot_count = head - kring->ckr_rhead;
690 if (slot_count < 0) {
691 slot_count += kring->ckr_num_slots;
692 }
693
694 /*
695 * Invoke nexus-specific TX prologue callback, set in na_kr_create().
696 */
697 if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
698 kring, head, &byte_count, &err_reason, p) != 0)) {
699 goto error;
700 }
701
702 /* update the user's view of slots & bytes transferred */
703 kr_update_user_stats(kring, slot_count, byte_count);
704
705 /* update the kernel view of ring */
706 kring->ckr_rhead = head;
707
708 /* save for kr_txsync_finalize(); only khead is needed */
709 kring->ckr_khead_pre = ckr_khead;
710
711 return head;
712
713 error:
714 SK_ERR("%s(%d) kr \"%s\" (%p) krflags 0x%x error: kh %u kt %u | "
715 "rh %u rt %u | h %u t %u |", sk_proc_name(p),
716 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
717 ckr_khead, ckr_ktail, kring->ckr_rhead, ckr_rtail, head,
718 ring->ring_tail);
719
720 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_TX_SYNC);
721
722 return kring->ckr_num_slots;
723 }
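/*
 * Caller-side sketch for the prologue/finalize pair (illustrative only):
 * a txsync path would typically look like
 *
 *	head = kr_txsync_prologue(ch, kring, p);
 *	if (__improbable(head >= kring->ckr_num_slots)) {
 *		... bail out; the offending process has been killed ...
 *	}
 *	... nexus-specific transmit of slots up to head ...
 *	kr_txsync_finalize(ch, kring, p);
 *
 * where the out-of-range return value is the error signal documented above.
 */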
724
725 /*
726 * Validate parameters in the ring/kring on entry for *_free_sync().
727 * Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
728 * in case of error, in order to force a reinit.
729 */
730 slot_idx_t
731 kr_free_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
732 {
733 struct __user_channel_ring *ring = kring->ckr_ring;
734 slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
735 slot_idx_t head;
736 uint64_t err_reason = 0;
737
738 VERIFY(sk_is_sync_protected());
739 /* read these once and use local copies */
740 head = ring->ring_head;
741 ckr_khead = kring->ckr_khead;
742 ckr_ktail = kring->ckr_ktail;
743 os_atomic_thread_fence(seq_cst);
744 ckr_rtail = kring->ckr_rtail;
745
746 SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
747 "rh %u rt %u | h %u t %u", sk_proc_name(p),
748 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
749 kring->ckr_rhead, ckr_rtail, ring->ring_head, ring->ring_tail);
750
751 _KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);
752
753 /* update the kernel view of ring */
754 kring->ckr_rhead = head;
755 return head;
756
757 error:
758 SK_ERR("%s(%d) kr \"%s\" (%p) krflags 0x%x error: kh %u kt %u | "
759 "rh %u rt %u | h %u t %u |", sk_proc_name(p),
760 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
761 ckr_khead, ckr_ktail, kring->ckr_rhead, ckr_rtail, head,
762 ring->ring_tail);
763
764 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_FREE_SYNC);
765 return kring->ckr_num_slots;
766 }
767
768 /*
769 * Nexus-specific kr_rxsync_prologue() callback.
770 */
771 int
772 kr_rxprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
773 const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
774 struct proc *p)
775 {
776 #pragma unused(ch, p)
777 slot_idx_t slot_idx = kring->ckr_rhead;
778 uint32_t nfree = 0;
779
780 ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
781
782 /*
783 * Iterating through the slots just read by user-space;
784 * ckr_rhead -> ring_head
785 */
786 while (slot_idx != head) {
787 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
788 struct __kern_quantum *kqum = ksd->sd_qum;
789
790 ASSERT(KSD_VALID_METADATA(ksd));
791 /* # of new bytes transferred */
792 *byte_count += kqum->qum_len;
793
794 /* detach and free the packet */
795 (void) KR_SLOT_DETACH_METADATA(kring, ksd);
796 ASSERT(nfree < kring->ckr_num_slots);
797 kring->ckr_scratch[nfree++] = (uint64_t)kqum;
798
799 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
800 }
801
802 if (nfree > 0) {
803 pp_free_packet_batch(kring->ckr_pp,
804 &kring->ckr_scratch[0], nfree);
805 }
806
807 /*
808 * Update the userspace channel statistic of # readable bytes by
809 * subtracting the byte count of slots just given back to the kernel.
810 */
811 if (kring->ckr_ready_bytes < *byte_count) {
812 SK_ERR("%s(%d) kr \"%s\" (%p) inconsistent ready bytes "
813 "(%llu < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
814 sk_proc_name(p), sk_proc_pid(p), kring->ckr_name,
815 SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
816 kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
817 kring->ckr_rtail, kring->ckr_ring->ring_head,
818 kring->ckr_ring->ring_tail);
819 *err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
820 return -1;
821 }
822 kring->ckr_ready_bytes -= *byte_count;
823
824 return 0;
825 }
826
827 /*
828 * Nexus-specific kr_rxsync_prologue() callback - no detach variant.
829 */
830 int
831 kr_rxprologue_nodetach(struct kern_channel *ch,
832 struct __kern_channel_ring *kring, const slot_idx_t head,
833 uint32_t *byte_count, uint64_t *err_reason, struct proc *p)
834 {
835 #pragma unused(ch, p)
836 slot_idx_t slot_idx = kring->ckr_rhead;
837
838 ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
839
840 /*
841 * Iterating through the slots just read by user-space;
842 * ckr_rhead -> ring_head
843 */
844 while (slot_idx != head) {
845 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
846 struct __kern_quantum *kqum = ksd->sd_qum;
847
848 ASSERT(KSD_VALID_METADATA(ksd));
849 /* # of new bytes transferred */
850 *byte_count += kqum->qum_len;
851 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
852 }
853
854 /*
855 * Update the userspace channel statistic of # readable bytes by
856 * subtracting the byte count of slots just given back to the kernel.
857 */
858 if (kring->ckr_ready_bytes < *byte_count) {
859 SK_ERR("%s(%d) kr \"%s\" (%p) inconsistent ready bytes "
860 "(%llu < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
861 sk_proc_name(p), sk_proc_pid(p), kring->ckr_name,
862 SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
863 kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
864 kring->ckr_rtail, kring->ckr_ring->ring_head,
865 kring->ckr_ring->ring_tail);
866 *err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
867 #if (DEVELOPMENT || DEBUG)
868 if (kr_disable_panic_on_sync_err == 0) {
869 panic("kr(%p), inconsistent, head %u, ready %llu, "
870 "cnt %u", SK_KVA(kring), head,
871 kring->ckr_ready_bytes, *byte_count);
872 /* NOTREACHED */
873 __builtin_unreachable();
874 }
875 #else /* (DEVELOPMENT || DEBUG) */
876 return -1;
877 #endif /* !(DEVELOPMENT || DEBUG) */
878 }
879 kring->ckr_ready_bytes -= *byte_count;
880
881 return 0;
882 }
883
884 /*
885 * Nexus-specific kr_rxsync_prologue() callback - user packet pool variant.
886 */
887 int
888 kr_rxprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
889 const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
890 struct proc *p)
891 {
892 #pragma unused(ch, p)
893 slot_idx_t slot_idx = kring->ckr_rhead;
894
895 ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
896
897 /*
898 * Iterating through the slots just read by user-space;
899 * ckr_rhead -> ring_head
900 */
901 while (slot_idx != head) {
902 struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
903
904 /*
905 * This is a user facing ring opting in for the user packet
906 * pool mode, so ensure that the user has detached packet
907 * from slot.
908 */
909 ASSERT(!KSD_VALID_METADATA(KR_KSD(kring, slot_idx)));
910 if (SD_VALID_METADATA(usd)) {
911 SK_ERR("%s(%d) kr \"%s\" (%p) slot %u not "
912 "detached md %u kh %u kt %u | rh %u rt %u |"
913 " h %u t %u", sk_proc_name(p),
914 sk_proc_pid(p), kring->ckr_name,
915 SK_KVA(kring), slot_idx, usd->sd_md_idx,
916 kring->ckr_khead, kring->ckr_ktail,
917 kring->ckr_rhead, kring->ckr_rtail,
918 kring->ckr_ring->ring_head,
919 kring->ckr_ring->ring_tail);
920 *err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
921 return -1;
922 }
923 *byte_count += usd->sd_len;
924
925 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
926 }
927
928 /*
929 * Update the userspace channel statistic of # readable bytes by
930 * subtracting the byte count of slots just given back to the kernel.
931 */
932 if (kring->ckr_ready_bytes < *byte_count) {
933 SK_ERR("%s(%d) kr \"%s\" (%p) inconsistent ready bytes "
934 "(%llu < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
935 sk_proc_name(p), sk_proc_pid(p), kring->ckr_name,
936 SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
937 kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
938 kring->ckr_rtail, kring->ckr_ring->ring_head,
939 kring->ckr_ring->ring_tail);
940 *err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
941 return -1;
942 }
943 kring->ckr_ready_bytes -= *byte_count;
944
945 return 0;
946 }
947
948 /*
949 * Validate parameters in the RX/ALLOC/EVENT ring/kring.
950 * For a valid configuration,
951 * khead <= head <= tail <= ktail
952 *
953 * We only consider head.
954 * khead and ktail are reliable.
955 */
956 #define _KR_RXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh) do { \
957 slot_idx_t _n = (_kring)->ckr_num_slots; \
958 /* kernel sanity checks */ \
959 NM_FAIL_ON((_kh) >= _n || (_kt) >= _n, \
960 SKYWALK_KILL_REASON_BASIC_SANITY); \
961 /* user sanity checks */ \
962 if ((_kt) >= (_kh)) { \
963 /* want khead <= head <= ktail */ \
964 NM_FAIL_ON((_rh) < (_kh) || (_rh) > (_kt), \
965 SKYWALK_KILL_REASON_HEAD_OOB); \
966 } else { \
967 /* we need head outside ktail..khead */ \
968 NM_FAIL_ON((_rh) < (_kh) && (_rh) > (_kt), \
969 SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED); \
970 } \
971 NM_FAIL_ON((_ring)->ring_tail != (_kring)->ckr_rtail, \
972 SKYWALK_KILL_REASON_TAIL_MISMATCH); \
973 } while (0)
974
975 /*
976 * Validate parameters in the ring/kring on entry for *_rxsync().
977 * Returns ring->ring_head if ok, kring->ckr_num_slots on error,
978 * in order to force a reinit.
979 */
980 slot_idx_t
981 kr_rxsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
982 struct proc *p)
983 {
984 #pragma unused(ch)
985 struct __user_channel_ring *ring = kring->ckr_ring;
986 slot_idx_t ckr_khead, ckr_ktail;
987 slot_idx_t head;
988 uint32_t byte_count = 0;
989 uint64_t err_reason = 0;
990 int slot_count;
991
992 VERIFY(sk_is_sync_protected());
993 /* assert that this routine is only called for user facing rings */
994 ASSERT(!KR_KERNEL_ONLY(kring));
995 ASSERT(kring->ckr_usds != NULL);
996
997 /* read these once and use local copies */
998 ckr_khead = kring->ckr_khead;
999 ckr_ktail = kring->ckr_ktail;
1000
1001 SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1002 "rh %u rt %u | h %u t %u", sk_proc_name(p),
1003 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1004 kring->ckr_rhead, kring->ckr_rtail,
1005 ring->ring_head, ring->ring_tail);
1006 /*
1007 * Before storing the new values, we should check they do not
1008 * move backwards. However:
1009 * - head is not an issue because the previous value is khead;
1010 * - cur could in principle go back, however it does not matter
1011 * because we are processing a brand new rxsync()
1012 */
1013 head = ring->ring_head; /* read only once */
1014
1015 _KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
1016
1017 /* # of reclaimed slots */
1018 slot_count = head - kring->ckr_rhead;
1019 if (slot_count < 0) {
1020 slot_count += kring->ckr_num_slots;
1021 }
1022
1023 /*
1024 * Invoke nexus-specific RX prologue callback, which may detach
1025 * and free any consumed packets. Configured in na_kr_create().
1026 */
1027 if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
1028 kring, head, &byte_count, &err_reason, p) != 0)) {
1029 goto error;
1030 }
1031 /* update the user's view of slots & bytes transferred */
1032 kr_update_user_stats(kring, slot_count, byte_count);
1033
1034 /* Update Rx dequeue timestamp */
1035 if (slot_count > 0) {
1036 kring->ckr_rx_dequeue_ts = net_uptime();
1037 }
1038
1039 /* update the kernel view of ring */
1040 kring->ckr_rhead = head;
1041 return head;
1042
1043 error:
1044 SK_ERR("%s(%d) kr \"%s\" (%p) krflags 0x%x error: kh %u kt %u | "
1045 "rh %u rt %u | h %u t %u", sk_proc_name(p),
1046 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1047 ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail,
1048 ring->ring_head, ring->ring_tail);
1049
1050 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_RX_SYNC);
1051 return kring->ckr_num_slots;
1052 }
1053
1054 /*
1055 * Validate parameters on the ring/kring on entry for *_alloc_sync().
1056 * Returns ring->ring_head if ok, kring->ckr_num_slots on error,
1057 * in order to force a reinit.
1058 */
1059 slot_idx_t
1060 kr_alloc_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
1061 {
1062 struct __user_channel_ring *ring = kring->ckr_ring;
1063 slot_idx_t ckr_khead, ckr_ktail;
1064 slot_idx_t head;
1065 uint64_t err_reason = 0;
1066
1067 VERIFY(sk_is_sync_protected());
1068
1069 /* read these once and use local copies */
1070 ckr_khead = kring->ckr_khead;
1071 ckr_ktail = kring->ckr_ktail;
1072 head = ring->ring_head;
1073
1074 SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1075 "rh %u rt %u | h %u t %u", sk_proc_name(p),
1076 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1077 kring->ckr_rhead, kring->ckr_rtail,
1078 head, ring->ring_tail);
1079 /*
1080 * Before storing the new values, we should check they do not
1081 * move backwards. However, head is not an issue because the
1082 * previous value is khead;
1083 */
1084 _KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
1085
1086 /* update the kernel view of ring */
1087 kring->ckr_rhead = head;
1088 return head;
1089
1090 error:
1091 SK_ERR("%s(%d) kr \"%s\" (%p) krflags 0x%x error: kh %u kt %u | "
1092 "rh %u rt %u | h %u t %u", sk_proc_name(p),
1093 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1094 ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail,
1095 ring->ring_head, ring->ring_tail);
1096
1097 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_ALLOC_SYNC);
1098 return kring->ckr_num_slots;
1099 }
1100
1101 /*
1102 * Nexus-specific kr_txsync_finalize() callback.
1103 */
1104 void
1105 kr_txfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1106 const slot_idx_t head, struct proc *p)
1107 {
1108 #pragma unused(ch)
1109 struct kern_pbufpool *pp = kring->ckr_pp;
1110 slot_idx_t slot_idx;
1111 uint32_t ph_cnt, i = 0;
1112 int32_t ph_needed;
1113 int err;
1114
1115 ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
1116
1117 /* use khead value from pre-sync time */
1118 slot_idx = kring->ckr_khead_pre;
1119
1120 ph_needed = head - slot_idx;
1121 if (ph_needed < 0) {
1122 ph_needed += kring->ckr_num_slots;
1123 }
1124 if (ph_needed == 0) {
1125 return;
1126 }
1127
1128 ph_cnt = (uint32_t)ph_needed;
1129 err = kern_pbufpool_alloc_batch(pp, 1, kring->ckr_scratch, &ph_cnt);
1130 VERIFY(err == 0 && ph_cnt == (uint32_t)ph_needed);
1131
1132 /* recycle the transferred packets */
1133 while (slot_idx != head) {
1134 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1135 kern_packet_t ph;
1136
1137 if (KSD_VALID_METADATA(ksd)) {
1138 goto next_slot;
1139 }
1140
1141 ph = kring->ckr_scratch[i];
1142 ASSERT(ph != 0);
1143 kring->ckr_scratch[i] = 0;
1144 ++i;
1145
1146 /*
1147 * Since this packet is freshly allocated and we need
1148 * to have the flag set for the attach to succeed,
1149 * just set it here rather than calling
1150 * __packet_finalize().
1151 */
1152 SK_PTR_ADDR_KQUM(ph)->qum_qflags |= QUM_F_FINALIZED;
1153
1154 KR_SLOT_ATTACH_METADATA(kring, ksd, SK_PTR_ADDR_KQUM(ph));
1155
1156 kr_externalize_metadata_internal(kring, pp->pp_max_frags,
1157 SK_PTR_ADDR_KQUM(ph), p);
1158 next_slot:
1159 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1160 }
1161
1162 if (i != ph_cnt) {
1163 kern_pbufpool_free_batch(pp, &kring->ckr_scratch[i],
1164 ph_cnt - i);
1165 }
1166 }
1167
1168 /*
1169 * Nexus-specific kr_txsync_finalize() callback - user packet pool variant.
1170 */
1171 void
1172 kr_txfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
1173 const slot_idx_t head, struct proc *p)
1174 {
1175 #pragma unused(ch, p)
1176 slot_idx_t slot_idx;
1177 uint32_t nfree = 0;
1178
1179 ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
1180
1181 /* use khead value from pre-sync time */
1182 slot_idx = kring->ckr_khead_pre;
1183
1184 /* recycle the transferred packets */
1185 while (slot_idx != head) {
1186 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1187
1188 if (KSD_VALID_METADATA(ksd)) {
1189 /* detach and free the packet */
1190 struct __kern_quantum *kqum = ksd->sd_qum;
1191 (void) KR_SLOT_DETACH_METADATA(kring, ksd);
1192 ASSERT(nfree < kring->ckr_num_slots);
1193 kring->ckr_scratch[nfree++] = (uint64_t)kqum;
1194 }
1195
1196 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1197 }
1198
1199 if (__probable(nfree > 0)) {
1200 pp_free_packet_batch(kring->ckr_pp,
1201 &kring->ckr_scratch[0], nfree);
1202 }
1203 }
1204
1205 /*
1206 * Update kring and ring at the end of txsync.
1207 */
1208 void
1209 kr_txsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1210 struct proc *p)
1211 {
1212 slot_idx_t ckr_khead, ckr_ktail;
1213 uint32_t slot_size;
1214 int32_t slot_diff;
1215
1216 VERIFY(sk_is_sync_protected());
1217 /* assert that this routine is only called for user facing rings */
1218 ASSERT(!KR_KERNEL_ONLY(kring));
1219
1220 /* read these once and use local copies */
1221 ckr_khead = kring->ckr_khead;
1222 ckr_ktail = kring->ckr_ktail;
1223
1224 /*
1225 * update userspace-facing channel statistics (# writable bytes/slots)
1226 *
1227 * Since the ring might be dynamically allocated, we can't rely on the
1228 * tail pointer to calculate free TX space (the tail might be sitting
1229 * at the edge of allocated ring space but be able to be pushed over
1230 * into unallocated ring space).
1231 *
1232 * Instead, calculate free TX space by looking at what slots are
1233 * available to the kernel for TX, and subtracting that from the total
1234 * number of possible slots. This is effectively what userspace can
1235 * write to.
1236 */
1237 slot_size = PP_BUF_SIZE_DEF(kring->ckr_pp);
1238 slot_diff = kring->ckr_rhead - ckr_khead;
1239 if (slot_diff < 0) {
1240 slot_diff += kring->ckr_num_slots;
1241 }
1242 slot_diff = kring->ckr_lim - slot_diff;
1243 kring->ckr_ready_slots = slot_diff;
1244 kring->ckr_ready_bytes = slot_diff * slot_size;
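/*
 * Worked example (illustrative only): with 1024 slots (ckr_lim == 1023),
 * ckr_rhead == 100 and ckr_khead == 90, the kernel still owns 10 slots,
 * so userspace may write 1023 - 10 = 1013 slots, and ckr_ready_bytes
 * covers 1013 default-sized buffers.
 */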
1245
1246 /*
1247 * Invoke nexus-specific TX finalize callback, which may recycle any
1248 * transferred packets and/or externalize new ones. Some nexuses don't
1249 * have any callback set. Configured in na_kr_create().
1250 */
1251 if (kring->ckr_finalize != NULL) {
1252 kring->ckr_finalize(ch, kring, ckr_khead, p);
1253 }
1254
1255 /* update ring tail/khead to what the kernel knows */
1256 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1257 kring->ckr_rtail = ckr_ktail;
1258 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1259
1260 SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
1261 "rh %u rt %u | h %u t %u", sk_proc_name(p),
1262 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1263 kring->ckr_rhead, kring->ckr_rtail,
1264 kring->ckr_ring->ring_head,
1265 kring->ckr_ring->ring_tail);
1266 }
1267
1268 /*
1269 * Nexus-specific kr_rxsync_finalize() callback.
1270 */
1271 void
1272 kr_rxfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1273 const slot_idx_t tail, struct proc *p)
1274 {
1275 #pragma unused(ch)
1276 const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
1277 slot_idx_t slot_idx = kring->ckr_rtail;
1278 uint32_t byte_count = 0;
1279
1280 while (slot_idx != tail) {
1281 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1282 struct __kern_quantum *kqum = ksd->sd_qum;
1283
1284 /*
1285 * nexus provider should never leave an empty slot on rx ring.
1286 */
1287 VERIFY(kqum != NULL);
1288 kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1289 ASSERT(!(KR_USD(kring, slot_idx)->sd_flags & ~SD_FLAGS_USER));
1290
1291 byte_count += kqum->qum_len;
1292 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1293 }
1294
1295 kring->ckr_ready_bytes += byte_count;
1296
1297 /* just recalculate slot count using pointer arithmetic */
1298 int32_t slot_diff = tail - kring->ckr_rhead;
1299 if (slot_diff < 0) {
1300 slot_diff += kring->ckr_num_slots;
1301 }
1302 kring->ckr_ready_slots = slot_diff;
1303
1304 #if CONFIG_NEXUS_NETIF
1305 /*
1306 * If this is a channel opened directly to the netif nexus, provide
1307 * it feedback on the number of packets and bytes consumed. This
1308 * will drive the receive mitigation strategy.
1309 */
1310 if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
1311 slot_diff != 0 && byte_count != 0) {
1312 kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
1313 }
1314 #endif /* CONFIG_NEXUS_NETIF */
1315 }
1316
1317 /*
1318 * Nexus-specific kr_rxsync_finalize() callback - user packet pool variant.
1319 */
1320 void
1321 kr_rxfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
1322 const slot_idx_t tail, struct proc *p)
1323 {
1324 const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
1325 slot_idx_t slot_idx = kring->ckr_rtail;
1326 struct kern_pbufpool *pp = kring->ckr_pp;
1327 uint32_t byte_count = 0;
1328
1329 PP_LOCK(pp);
1330 while (slot_idx != tail) {
1331 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1332 struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
1333 struct __kern_quantum *kqum = ksd->sd_qum;
1334
1335 /*
1336 * nexus provider should never leave an empty slot on rx ring.
1337 */
1338 VERIFY(kqum != NULL);
1339 /*
1340 * The channel is operating in packet allocator
1341 * mode, so add packet to the allocated list.
1342 */
1343 pp_insert_upp_locked(pp, kqum, ch->ch_pid);
1344
1345 KSD_DETACH_METADATA(ksd);
1346 /* To calculate ckr_ready_bytes by kr_rxsync_prologue */
1347 USD_SET_LENGTH(usd, (uint16_t)kqum->qum_len);
1348
1349 kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1350 ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);
1351
1352 byte_count += kqum->qum_len;
1353 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1354 }
1355 ch_update_upp_buf_stats(ch, pp);
1356 PP_UNLOCK(pp);
1357
1358 kring->ckr_ready_bytes += byte_count;
1359
1360 /* just recalculate slot count using pointer arithmetic */
1361 int32_t slot_diff = tail - kring->ckr_rhead;
1362 if (slot_diff < 0) {
1363 slot_diff += kring->ckr_num_slots;
1364 }
1365 kring->ckr_ready_slots = slot_diff;
1366
1367 #if CONFIG_NEXUS_NETIF
1368 /*
1369 * If this is a channel opened directly to the netif nexus, provide
1370 * it feedback on the number of packets and bytes consumed. This
1371 * will drive the receive mitigation strategy.
1372 */
1373 if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
1374 slot_diff != 0 && byte_count != 0) {
1375 kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
1376 }
1377 #endif /* CONFIG_NEXUS_NETIF */
1378 }
1379
1380 /*
1381 * Update kring and ring at the end of rxsync
1382 */
1383 void
1384 kr_rxsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1385 struct proc *p)
1386 {
1387 #pragma unused(ch, p)
1388 slot_idx_t ckr_khead, ckr_ktail;
1389
1390 VERIFY(sk_is_sync_protected());
1391 /* assert that this routine is only called for user facing rings */
1392 ASSERT(!KR_KERNEL_ONLY(kring));
1393 ASSERT(kring->ckr_usds != NULL);
1394
1395 /* read these once and use local copies */
1396 ckr_khead = kring->ckr_khead;
1397 ckr_ktail = kring->ckr_ktail;
1398
1399 /*
1400 * Invoke nexus-specific RX finalize callback; set in na_kr_create().
1401 */
1402 if (kring->ckr_finalize != NULL) {
1403 kring->ckr_finalize(ch, kring, ckr_ktail, p);
1404 }
1405
1406 /* update ring tail/khead to what the kernel knows */
1407 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1408 kring->ckr_rtail = ckr_ktail;
1409 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1410
1411 SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1412 "rh %u rt %u | h %u t %u", sk_proc_name(p),
1413 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1414 kring->ckr_rhead, kring->ckr_rtail,
1415 kring->ckr_ring->ring_head,
1416 kring->ckr_ring->ring_tail);
1417 }
1418
1419 void
1420 kr_alloc_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
1421 {
1422 #pragma unused(p)
1423 slot_idx_t ckr_khead, ckr_ktail;
1424
1425 VERIFY(sk_is_sync_protected());
1426 /* read these once and use local copies */
1427 ckr_khead = kring->ckr_khead;
1428 ckr_ktail = kring->ckr_ktail;
1429
1430 /* update ring tail/khead to what the kernel knows */
1431 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1432 kring->ckr_rtail = ckr_ktail;
1433 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1434 *(uint32_t *)(uintptr_t)&kring->ckr_ring->ring_alloc_ws =
1435 kring->ckr_alloc_ws;
1436
1437 SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1438 "rh %u rt %u | h %u t %u | ws %u",
1439 sk_proc_name(p),
1440 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1441 kring->ckr_rhead, kring->ckr_rtail,
1442 kring->ckr_ring->ring_head,
1443 kring->ckr_ring->ring_tail, kring->ckr_alloc_ws);
1444 }
1445
1446 void
1447 kr_free_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
1448 {
1449 #pragma unused(p)
1450 slot_idx_t ckr_khead, ckr_ktail;
1451
1452 VERIFY(sk_is_sync_protected());
1453 /* read these once and use local copies */
1454 ckr_khead = kring->ckr_khead;
1455 ckr_ktail = kring->ckr_ktail;
1456
1457 /* update ring tail/khead to what the kernel knows */
1458 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1459 kring->ckr_rtail = ckr_ktail;
1460 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1461
1462 SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1463 "rh %u rt %u | h %u t %u", sk_proc_name(p),
1464 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1465 kring->ckr_rhead, kring->ckr_rtail,
1466 kring->ckr_ring->ring_head,
1467 kring->ckr_ring->ring_tail);
1468 }
1469
1470 slot_idx_t
1471 kr_event_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
1472 {
1473 struct __user_channel_ring *ring = kring->ckr_ring;
1474 slot_idx_t ckr_khead, ckr_ktail;
1475 slot_idx_t head, slot_idx;
1476 uint64_t err_reason = 0;
1477
1478 ASSERT(kring->ckr_tx == NR_EV);
1479 VERIFY(sk_is_sync_protected());
1480
1481 /* read these once and use local copies */
1482 ckr_khead = kring->ckr_khead;
1483 ckr_ktail = kring->ckr_ktail;
1484 head = ring->ring_head;
1485
1486 SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1487 "rh %u rt %u | h %u t %u", sk_proc_name(p),
1488 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1489 kring->ckr_rhead, kring->ckr_rtail,
1490 head, ring->ring_tail);
1491 /*
1492 * Before storing the new values, we should check they do not
1493 * move backwards. However, head is not an issue because the
1494 * previous value is khead;
1495 */
1496 _KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
1497
1498 /*
1499 * Iterating through the slots just read by user-space;
1500 * ckr_rhead -> ring_head
1501 */
1502 slot_idx = kring->ckr_rhead;
1503 while (slot_idx != head) {
1504 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1505 struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
1506 /*
1507 * ensure that the user has detached packet from slot.
1508 */
1509 VERIFY(!KSD_VALID_METADATA(ksd));
1510 if (__improbable(SD_VALID_METADATA(usd))) {
1511 SK_ERR("%s(%d) kr \"%s\" (%p) slot %u not "
1512 "detached md %u kh %u kt %u | rh %u rt %u |"
1513 " h %u t %u", sk_proc_name(p),
1514 sk_proc_pid(p), kring->ckr_name,
1515 SK_KVA(kring), slot_idx, usd->sd_md_idx,
1516 ckr_khead, ckr_ktail, kring->ckr_rhead,
1517 kring->ckr_rtail, ring->ring_head,
1518 ring->ring_tail);
1519 err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
1520 goto error;
1521 }
1522 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1523 }
1524
1525 /* update the kernel view of ring */
1526 kring->ckr_rhead = head;
1527 return head;
1528
1529 error:
1530 SK_ERR("%s(%d) kr \"%s\" (%p) krflags 0x%x error: kh %u kt %u | "
1531 "rh %u rt %u | h %u t %u", sk_proc_name(p),
1532 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1533 ckr_khead, ckr_ktail, kring->ckr_rhead, kring->ckr_rtail,
1534 ring->ring_head, ring->ring_tail);
1535
1536 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_EVENT_SYNC);
1537 return kring->ckr_num_slots;
1538 }
1539
1540 void
1541 kr_event_sync_finalize(struct kern_channel *ch,
1542 struct __kern_channel_ring *kring, struct proc *p)
1543 {
1544 #pragma unused(ch)
1545 struct kern_pbufpool *pp = kring->ckr_pp;
1546 const uint32_t maxfrags = pp->pp_max_frags;
1547 slot_idx_t ckr_khead, ckr_ktail, ckr_rhead;
1548 struct __kern_slot_desc *ksd;
1549 struct __user_slot_desc *usd;
1550 struct __kern_quantum *kqum;
1551
1552 VERIFY(sk_is_sync_protected());
1553 /* assert that this routine is only called for user facing rings */
1554 ASSERT(!KR_KERNEL_ONLY(kring));
1555 ASSERT(kring->ckr_usds != NULL);
1556 ASSERT(kring->ckr_tx == NR_EV);
1557
1558 /* read these once and use local copies */
1559 ckr_khead = kring->ckr_khead;
1560 ckr_ktail = kring->ckr_ktail;
1561 ckr_rhead = kring->ckr_rhead;
1562
1563 slot_idx_t slot_idx = kring->ckr_rtail;
1564 PP_LOCK(pp);
1565 while (slot_idx != ckr_ktail) {
1566 ksd = KR_KSD(kring, slot_idx);
1567 usd = KR_USD(kring, slot_idx);
1568 kqum = ksd->sd_qum;
1569
1570 /*
1571 * Add packet to the allocated list of user packet pool.
1572 */
1573 pp_insert_upp_locked(pp, kqum, ch->ch_pid);
1574
1575 KSD_DETACH_METADATA(ksd);
1576 kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1577 ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);
1578 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1579 }
1580 ch_update_upp_buf_stats(ch, pp);
1581 PP_UNLOCK(pp);
1582
1583 /* just recalculate slot count using pointer arithmetic */
1584 int32_t slot_diff = ckr_ktail - ckr_rhead;
1585 if (slot_diff < 0) {
1586 slot_diff += kring->ckr_num_slots;
1587 }
1588 kring->ckr_ready_slots = slot_diff;
1589
1590 /* update ring tail/khead to what the kernel knows */
1591 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1592 kring->ckr_rtail = ckr_ktail;
1593 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1594
1595 SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1596 "rh %u rt %u | h %u t %u", sk_proc_name(p),
1597 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1598 kring->ckr_rhead, kring->ckr_rtail,
1599 kring->ckr_ring->ring_head,
1600 kring->ckr_ring->ring_tail);
1601 }
1602 #undef NM_FAIL_ON
1603
1604 void
1605 kr_txkring_reclaim_and_refill(struct __kern_channel_ring *kring,
1606 slot_idx_t index)
1607 {
1608 const slot_idx_t lim = kring->ckr_lim;
1609 slot_idx_t next_index = SLOT_NEXT(index, lim);
1610
1611 kring->ckr_khead = next_index;
1612 /* reclaim */
1613 kring->ckr_ktail = index;
1614 }
1615
1616 /*
1617 * *************************************************************************
1618 * Checks on packet header offsets in kr_internalize_metadata
1619 * *************************************************************************
1620 *
1621 * +----------+------------------------------+----------------------------+
1622 * |          | NEXUS_META_SUBTYPE_RAW       | NEXUS_META_SUBTYPE_PAYLOAD |
1623 * |----------+------------------------------+----------------------------+
1624 * | buflet   | (bdoff + len) <= dlim        | (bdoff + len) <= dlim      |
1625 * |----------+------------------------------+----------------------------+
1626 * | headroom | hr == bdoff && hr < bdlim    | hr == 0 && bdoff == 0      |
1627 * |----------+------------------------------+----------------------------+
1628 * | l2_len   | hr + l2_len < bdlim          | l2_len == 0                |
1629 * |----------+------------------------------+----------------------------+
1630 */
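/*
 * Example against the table above (illustrative only): a RAW-subtype
 * packet with headroom hr == 2, buflet data offset bdoff == 2,
 * l2_len == 14 and len == 1500 passes as long as (2 + 1500) <= dlim and
 * (2 + 14) < bdlim; a PAYLOAD-subtype packet must instead present
 * hr == 0, bdoff == 0 and l2_len == 0.
 */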
1631 int
1632 kr_internalize_metadata(struct kern_channel *ch,
1633 struct __kern_channel_ring *kring, const uint32_t maxfrags,
1634 struct __kern_quantum *kqum, struct proc *p)
1635 {
1636 #pragma unused(kring, maxfrags, p)
1637 struct __user_buflet *ubuf, *pubuf; /* user buflet */
1638 struct __kern_buflet *kbuf, *pkbuf; /* kernel buflet */
1639 struct __user_quantum *uqum; /* user source */
1640 struct __user_packet *upkt;
1641 struct __kern_packet *kpkt;
1642 uint32_t len = 0, bdoff, bdlim;
1643 uint16_t bcnt = 0, bmax, i;
1644 boolean_t dropped;
1645 int err = 0;
1646
1647 /*
1648 * Verify that the quantum/packet belongs to the same pp as
1649 * the one used by the adapter, i.e. the packet must have
1650 * been allocated from the same pp and attached to the kring.
1651 */
1652 ASSERT(kqum->qum_pp == kring->ckr_pp);
1653
1654 static_assert(sizeof(uqum->qum_com) == sizeof(kqum->qum_com));
1655 static_assert(sizeof(upkt->pkt_com) == sizeof(kpkt->pkt_com));
1656 uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
1657 ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
1658 upkt = SK_PTR_ADDR_UPKT(uqum);
1659 kpkt = SK_PTR_ADDR_KPKT(kqum);
1660
1661 DTRACE_SKYWALK3(internalize, struct __kern_channel_ring *, kring,
1662 struct __kern_packet *, kpkt, struct __user_packet *, upkt);
1663 SK_DF(SK_VERB_MEM, "%s(%d) kring %p uqum %p -> kqum %p",
1664 sk_proc_name(p), sk_proc_pid(p), SK_KVA(kring),
1665 SK_KVA(uqum), SK_KVA(kqum));
1666
1667 /* check if it's dropped before we internalize it */
1668 dropped = ((uqum->qum_qflags & QUM_F_DROPPED) != 0);
1669
1670 /*
1671 * Internalize common quantum metadata.
1672 *
1673 * For packet metadata, we trust the kernel copy for the buflet
1674 * count and limit; any mismatch on the user copy will cause
1675 * us to drop this packet.
1676 */
1677 _QUM_INTERNALIZE(uqum, kqum);
1678
1679 /* if marked as dropped, don't bother going further */
1680 if (__improbable(dropped)) {
1681 SK_ERR("%s(%d) kring %p dropped",
1682 sk_proc_name(p), sk_proc_pid(p), SK_KVA(kring));
1683 err = ERANGE;
1684 goto done;
1685 }
1686
1687 /*
1688 * Internalize common packet metadata.
1689 */
1690 _PKT_INTERNALIZE(upkt, kpkt);
1691
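	/*
	 * Record the originating channel's UUID as the packet's flow
	 * source identifier; ch may be NULL when the caller has no
	 * channel context.
	 */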
1692 if (__probable(ch != NULL)) {
1693 _UUID_COPY(kpkt->pkt_flowsrc_id,
1694 ch->ch_info->cinfo_ch_id);
1695 }
1696
1697 bcnt = upkt->pkt_bufs_cnt;
1698 bmax = kpkt->pkt_bufs_max;
1699 ASSERT(bmax == maxfrags);
1700 if (__improbable((bcnt == 0) || (bcnt > bmax) ||
1701 (upkt->pkt_bufs_max != bmax))) {
1702 SK_ERR("%s(%d) kring %p bad bufcnt %d, %d, %d",
1703 sk_proc_name(p), sk_proc_pid(p),
1704 SK_KVA(kring), bcnt, bmax, upkt->pkt_bufs_max);
1705 err = ERANGE;
1706 goto done;
1707 }
1708
1709 ASSERT(bcnt != 0);
1710 ubuf = pubuf = NULL;
1711 kbuf = pkbuf = NULL;
1712
1713 /*
1714 * Validate and internalize buflets.
1715 */
1716 for (i = 0; i < bcnt; i++) {
1717 static_assert(offsetof(struct __kern_packet, pkt_qum) == 0);
1718 static_assert(offsetof(struct __user_packet, pkt_qum) == 0);
1719 static_assert(offsetof(struct __kern_quantum, qum_com) == 0);
1720 PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
1721 ASSERT(kbuf != NULL);
1722 if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
1723 struct __kern_buflet_ext *kbuf_ext;
1724
1725 kbuf_ext = __container_of(kbuf,
1726 struct __kern_buflet_ext, kbe_overlay);
1727 ubuf = __DECONST(struct __user_buflet *,
1728 kbuf_ext->kbe_buf_user);
1729 } else {
1730 ASSERT(i == 0);
1731 ubuf = __DECONST(struct __user_buflet *,
1732 &uqum->qum_buf[0]);
1733 }
1734 ASSERT(ubuf != NULL);
1735 ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
1736 ASSERT(kbuf->buf_dlim == _BUF_DLIM(kbuf, kqum->qum_pp));
1737 ASSERT(kbuf->buf_addr != 0);
1738 /*
1739 	 * For now, the user-facing pool does not support shared
1740 	 * buffers, since otherwise the ubuf and kbuf buffer
1741 * indices would not match. Assert this is the case.
1742 */
1743 ASSERT(kbuf->buf_addr == (mach_vm_address_t)kbuf->buf_objaddr);
1744
1745 kbuf->buf_dlen = ubuf->buf_dlen;
1746 kbuf->buf_doff = ubuf->buf_doff;
1747
1748 /*
1749 * Check:
1750 * - sanity of buflet data offset and length
1751 * - kernel and user metadata use the same object index
1752 	 * - user hasn't modified nbft_idx: we don't allow user space
1753 	 *   to construct multi-buflet packets for TX.
1754 */
1755 if (__improbable(!BUF_IN_RANGE(kbuf) ||
1756 ubuf->buf_idx != kbuf->buf_idx ||
1757 ubuf->buf_nbft_idx != kbuf->buf_nbft_idx)) {
1758 kbuf->buf_dlen = kbuf->buf_doff = 0;
1759 SK_ERR("%s(%d) kring %p bad bufidx 0x%x, 0x%x"
1760 " 0x%x, 0x%x",
1761 sk_proc_name(p), sk_proc_pid(p),
1762 SK_KVA(kring), kbuf->buf_idx, ubuf->buf_idx,
1763 kbuf->buf_nbft_idx, ubuf->buf_nbft_idx);
1764 err = ERANGE;
1765 goto done;
1766 }
1767
1768 /* save data offset from the first buflet */
1769 if (pkbuf == NULL) {
1770 bdoff = kbuf->buf_doff;
1771 }
1772
1773 /* all good to go */
1774 len += kbuf->buf_dlen;
1775 pubuf = ubuf;
1776 pkbuf = kbuf;
1777 }
1778
1779 static_assert(offsetof(struct __kern_packet, pkt_length) == offsetof(struct __kern_packet, pkt_qum.qum_len));
1780 if (__improbable(kpkt->pkt_length != len)) {
1781 SK_ERR("%s(%d) kring %p bad pktlen %d, %d",
1782 sk_proc_name(p), sk_proc_pid(p),
1783 SK_KVA(kring), kpkt->pkt_length, len);
1784 err = ERANGE;
1785 goto done;
1786 }
1787
1788 if (err == 0) {
1789 bdlim = PP_BUF_SIZE_DEF(kqum->qum_pp);
1790 /*
1791 * For a raw packet from user space we need to
1792 * validate that headroom is sane and is in the
1793 * first buflet.
1794 */
1795 if (__improbable(kpkt->pkt_headroom != bdoff)) {
1796 SK_ERR("%s(%d) kring %p bad headroom %d, %d",
1797 sk_proc_name(p), sk_proc_pid(p),
1798 SK_KVA(kring), kpkt->pkt_headroom, bdoff);
1799 err = ERANGE;
1800 goto done;
1801 }
1802 if (__improbable(kpkt->pkt_headroom +
1803 kpkt->pkt_l2_len >= bdlim)) {
1804 SK_ERR("%s(%d) kring %p bad headroom l2len %d, %d",
1805 sk_proc_name(p), sk_proc_pid(p),
1806 SK_KVA(kring), kpkt->pkt_l2_len, bdlim);
1807 err = ERANGE;
1808 goto done;
1809 }
1810
1811 /* validate checksum offload properties */
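		/*
		 * pkt_csum_tx_start_off is where summing begins and
		 * pkt_csum_tx_stuff_off is where the 16-bit result is
		 * written back, so both must lie within pkt_length and
		 * start must not exceed stuff.
		 */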
1812 if (__probable(PACKET_HAS_PARTIAL_CHECKSUM(kpkt))) {
1813 uint16_t start = kpkt->pkt_csum_tx_start_off;
1814 uint16_t stuff = kpkt->pkt_csum_tx_stuff_off;
1815 if (__improbable(start > stuff ||
1816 start > kpkt->pkt_length ||
1817 (stuff + sizeof(uint16_t)) > kpkt->pkt_length)) {
1818 SK_ERR("%s(%d) flags 0x%x start %u stuff %u "
1819 "len %u", sk_proc_name(p),
1820 sk_proc_pid(p), kpkt->pkt_csum_flags,
1821 start, stuff, kpkt->pkt_length);
1822 err = ERANGE;
1823 goto done;
1824 }
1825 } else {
1826 kpkt->pkt_csum_tx_start_off = 0;
1827 kpkt->pkt_csum_tx_stuff_off = 0;
1828 }
1829 *__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = bcnt;
1830 }
1831
1832 done:
1833 if (__probable(err == 0)) {
1834 kqum->qum_len = len;
1835 kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_FINALIZED);
1836 } else {
1837 kqum->qum_len = 0;
1838 kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_DROPPED);
1839 }
1840 return err;
1841 }
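
/*
 * Sketch of a typical caller (names are hypothetical): a TX sync path
 * would internalize each slot's packet before handing it to the nexus,
 * along the lines of
 *
 *	err = kr_internalize_metadata(ch, kring, pp_max_frags, kqum, p);
 *	if (__improbable(err != 0)) {
 *		drop_stats++;
 *		continue;
 *	}
 *
 * where pp_max_frags and drop_stats stand in for the caller's own
 * fragment limit and drop accounting; on failure the quantum has
 * already been marked QUM_F_INTERNALIZED | QUM_F_DROPPED above.
 */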
1842
1843 __attribute__((always_inline))
1844 static inline void
1845 kr_externalize_metadata_internal(struct __kern_channel_ring *kring,
1846 const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
1847 {
1848 #pragma unused(kring, maxfrags, p)
1849 struct __kern_buflet *kbuf, *pkbuf; /* kernel buflet */
1850 struct __user_buflet *ubuf, *pubuf; /* user buflet */
1851 struct __user_quantum *uqum; /* user destination */
1852 struct __user_packet *upkt;
1853 struct __kern_packet *kpkt;
1854 uint32_t len = 0;
1855 uint16_t bcnt = 0, bmax, i;
1856
1857 /*
1858 * Verify that the quantum/packet belongs to the same pp as
1859 * the one used by the adapter, i.e. the packet must have
1860 * been allocated from the same pp and attached to the kring.
1861 */
1862 ASSERT(kqum->qum_pp == kring->ckr_pp);
1863 ASSERT(kqum->qum_qflags & (QUM_F_FINALIZED | QUM_F_INTERNALIZED));
1864
1865 static_assert(sizeof(kpkt->pkt_com) == sizeof(upkt->pkt_com));
1866 static_assert(sizeof(kqum->qum_com) == sizeof(uqum->qum_com));
1867 uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
1868 ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
1869 upkt = SK_PTR_ADDR_UPKT(uqum);
1870 kpkt = SK_PTR_ADDR_KPKT(kqum);
1871
1872 DTRACE_SKYWALK3(externalize, struct __kern_channel_ring *, kring,
1873 struct __kern_packet *, kpkt, struct __user_packet *, upkt);
1874 SK_DF(SK_VERB_MEM, "%s(%d) kring %p kqum %p -> uqum %p",
1875 sk_proc_name(p), sk_proc_pid(p), SK_KVA(kring),
1876 SK_KVA(kqum), SK_KVA(uqum));
1877
1878 /*
1879 * Externalize common quantum metadata.
1880 */
1881 _QUM_EXTERNALIZE(kqum, uqum);
1882
1883 bcnt = kpkt->pkt_bufs_cnt;
1884 bmax = kpkt->pkt_bufs_max;
1885 ASSERT(bmax == maxfrags);
1886 ASSERT(bcnt <= bmax);
1887 /*
1888 * Externalize common packet metadata.
1889 */
1890 _PKT_EXTERNALIZE(kpkt, upkt);
1891
1892 /* sanitize buflet count and limit (deconst) */
1893 static_assert(sizeof(upkt->pkt_bufs_max) == sizeof(uint16_t));
1894 static_assert(sizeof(upkt->pkt_bufs_cnt) == sizeof(uint16_t));
1895 *(uint16_t *)(uintptr_t)&upkt->pkt_bufs_max = bmax;
1896 *(uint16_t *)(uintptr_t)&upkt->pkt_bufs_cnt = bcnt;
1897
1898 ASSERT(bcnt != 0);
1899 /*
1900 	 * Special handling to externalize an empty packet buflet.
1901 */
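	/*
	 * (buf_addr == 0 means the kernel buflet has no buffer attached;
	 * UBUF_INIT() below presumably re-initializes the embedded user
	 * buflet so the externalize loop still sees a well-formed first
	 * buflet.)
	 */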
1902 kbuf = &kpkt->pkt_qum.qum_buf[0];
1903 if (kbuf->buf_addr == 0) {
1904 ubuf = __DECONST(struct __user_buflet *,
1905 &kpkt->pkt_qum.qum_user->qum_buf[0]);
1906 UBUF_INIT(kbuf, ubuf);
1907 }
1908
1909 kbuf = pkbuf = NULL;
1910 ubuf = pubuf = NULL;
1911 /*
1912 * Externalize buflets.
1913 */
1914 for (i = 0; i < bcnt; i++) {
1915 static_assert(offsetof(struct __kern_packet, pkt_qum) == 0);
1916 PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
1917 ASSERT(kbuf != NULL);
1918
1919 if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
1920 struct __kern_buflet_ext *kbuf_ext;
1921
1922 kbuf_ext = __container_of(kbuf,
1923 struct __kern_buflet_ext, kbe_overlay);
1924 ubuf = __DECONST(struct __user_buflet *,
1925 kbuf_ext->kbe_buf_user);
1926 } else {
1927 ASSERT(i == 0);
1928 ubuf = __DECONST(struct __user_buflet *,
1929 &kpkt->pkt_qum.qum_user->qum_buf[0]);
1930 }
1931
1932 ASSERT(ubuf != NULL);
1933 ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
1934 ASSERT(BUF_IN_RANGE(kbuf));
1935 KBUF_EXTERNALIZE(kbuf, ubuf, kqum->qum_pp);
1936
1937 /* all good to go */
1938 len += kbuf->buf_dlen;
1939 pkbuf = kbuf;
1940 pubuf = ubuf;
1941 }
1942
1943 uqum->qum_len = len;
1944 uqum->qum_qflags |= QUM_F_FINALIZED;
1945
1946 /*
1947 * XXX: [email protected] -- do this during reclaim instead?
1948 */
1949 kqum->qum_qflags &= ~QUM_F_INTERNALIZED;
1950 }
1951
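/*
 * Out-of-line wrapper: kr_externalize_metadata_internal() is marked
 * always_inline for the hot paths within this file, and this exported
 * function presumably exists so that callers elsewhere can reach the
 * same logic without duplicating it.
 */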
1952 void
1953 kr_externalize_metadata(struct __kern_channel_ring *kring,
1954 const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
1955 {
1956 kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1957 }
1958
1959 #if SK_LOG
1960 SK_NO_INLINE_ATTRIBUTE
1961 char *
1962 kr2str(const struct __kern_channel_ring *kr, char *__counted_by(dsz)dst,
1963 size_t dsz)
1964 {
1965 (void) sk_snprintf(dst, dsz, "%p %s %s flags 0x%b",
1966 SK_KVA(kr), kr->ckr_name, sk_ring2str(kr->ckr_tx), kr->ckr_flags,
1967 CKRF_BITS);
1968
1969 return dst;
1970 }
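
/*
 * Illustrative usage (buffer size is arbitrary): callers provide their
 * own scratch storage, e.g.
 *
 *	char krbuf[256];
 *	SK_DF(SK_VERB_SYNC, "ring: %s", kr2str(kr, krbuf, sizeof(krbuf)));
 */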
1971 #endif /* SK_LOG */
1972