1 /*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/kdebug.h>
30 #include <skywalk/os_skywalk_private.h>
31 #include <net/ntstat.h>
32 #include <skywalk/nexus/flowswitch/nx_flowswitch.h>
33 #include <skywalk/nexus/netif/nx_netif.h>
34 #include <skywalk/nexus/upipe/nx_user_pipe.h>
35
/*
 * Evaluates to true when slot index _idx equals the ring's ckr_rhead;
 * kern_channel_get_next_slot() treats that as "no further TX slot".
 */
#define KRING_EMPTY_TX(_kr, _idx)       \
	((_idx) == (_kr)->ckr_rhead)

/*
 * Evaluates to true when the slot following _idx (wrapping at ckr_lim)
 * is the ring's ckr_khead; kern_channel_get_next_slot() treats that as
 * "no further RX slot".
 */
#define KRING_FULL_RX(_kr, _idx)        \
	(SLOT_NEXT((_idx), (_kr)->ckr_lim) == (_kr)->ckr_khead)
41
42 uint32_t
kern_channel_notify(const kern_channel_ring_t kring,uint32_t flags)43 kern_channel_notify(const kern_channel_ring_t kring, uint32_t flags)
44 {
45 #pragma unused(flags)
46 if (__improbable(KR_DROP(kring))) {
47 return ENXIO;
48 }
49
50 return kring->ckr_na_notify(kring, kernproc, 0);
51 }
52
53 uint32_t
kern_channel_reclaim(const kern_channel_ring_t kring)54 kern_channel_reclaim(const kern_channel_ring_t kring)
55 {
56 return kr_reclaim(kring);
57 }
58
59 static inline uint32_t
_kern_channel_available_slot_count_tx(const kern_channel_ring_t kring,slot_idx_t index)60 _kern_channel_available_slot_count_tx(const kern_channel_ring_t kring,
61 slot_idx_t index)
62 {
63 ASSERT(kring->ckr_tx == NR_TX);
64
65 if (kring->ckr_rhead < index) {
66 return kring->ckr_num_slots + kring->ckr_rhead - index;
67 }
68
69 return kring->ckr_rhead - index;
70 }
71
72 static inline uint32_t
_kern_channel_available_slot_count_rx(const kern_channel_ring_t kring,slot_idx_t index)73 _kern_channel_available_slot_count_rx(const kern_channel_ring_t kring,
74 slot_idx_t index)
75 {
76 uint32_t busy;
77 slot_idx_t lim = kring->ckr_lim;
78
79 ASSERT(kring->ckr_tx == NR_RX);
80
81 if (index < kring->ckr_khead) {
82 busy = kring->ckr_num_slots + index - kring->ckr_khead;
83 } else {
84 busy = index - kring->ckr_khead;
85 }
86
87 ASSERT(lim >= busy);
88 return lim - busy;
89 }
90
91 uint32_t
kern_channel_available_slot_count(const kern_channel_ring_t kring)92 kern_channel_available_slot_count(const kern_channel_ring_t kring)
93 {
94 if (kring->ckr_tx == NR_TX) {
95 return _kern_channel_available_slot_count_tx(kring,
96 kring->ckr_khead);
97 } else {
98 return _kern_channel_available_slot_count_rx(kring,
99 kring->ckr_ktail);
100 }
101 }
102
103 kern_channel_slot_t
kern_channel_get_next_slot(const kern_channel_ring_t kring,const kern_channel_slot_t slot0,struct kern_slot_prop * prop)104 kern_channel_get_next_slot(const kern_channel_ring_t kring,
105 const kern_channel_slot_t slot0, struct kern_slot_prop *prop)
106 {
107 kern_channel_slot_t slot;
108 slot_idx_t slot_idx;
109
110 /* Ensure this is only done by the thread doing a sync syscall */
111 VERIFY(sk_is_sync_protected());
112
113 if (__improbable(slot0 == NULL)) {
114 if (kring->ckr_tx == NR_TX) {
115 slot_idx = kring->ckr_khead;
116 } else {
117 slot_idx = kring->ckr_ktail;
118 }
119 } else {
120 slot_idx = SLOT_NEXT(KR_SLOT_INDEX(kring, slot0),
121 kring->ckr_lim);
122 }
123
124 ASSERT(slot_idx < kring->ckr_num_slots);
125
126 if (kring->ckr_tx == NR_TX) {
127 if (__improbable(KRING_EMPTY_TX(kring, slot_idx))) {
128 SK_DF(SK_VERB_SYNC | SK_VERB_TX,
129 "EMPTY_TX: na \"%s\" kr \"%s\" "
130 "i %u (kc %u kt %u | rh %u rt %u)",
131 KRNA(kring)->na_name,
132 kring->ckr_name, slot_idx, kring->ckr_khead,
133 kring->ckr_ktail, kring->ckr_rhead,
134 kring->ckr_rtail);
135 slot = NULL;
136 } else {
137 slot = &kring->ckr_ksds[slot_idx];
138 }
139 } else {
140 if (__improbable(KRING_FULL_RX(kring, slot_idx))) {
141 SK_DF(SK_VERB_SYNC | SK_VERB_RX,
142 "FULL_RX: na \"%s\" kr \"%s\" "
143 "i %u (kc %u kt %u | rh %u rt %u)",
144 KRNA(kring)->na_name,
145 kring->ckr_name, slot_idx, kring->ckr_khead,
146 kring->ckr_ktail, kring->ckr_rhead,
147 kring->ckr_rtail);
148 slot = NULL;
149 } else {
150 slot = &kring->ckr_ksds[slot_idx];
151 }
152 }
153
154 if (prop != NULL) {
155 bzero(prop, sizeof(*prop));
156 }
157
158 return slot;
159 }
160
161 static inline void
_kern_channel_advance_slot_tx(const kern_channel_ring_t kring,slot_idx_t index)162 _kern_channel_advance_slot_tx(const kern_channel_ring_t kring, slot_idx_t index)
163 {
164 /* Ensure this is only done by the thread doing a sync syscall */
165 VERIFY(sk_is_sync_protected());
166 kr_txkring_reclaim_and_refill(kring, index);
167 }
168
169 static inline void
_kern_channel_advance_slot_rx(const kern_channel_ring_t kring,slot_idx_t index)170 _kern_channel_advance_slot_rx(const kern_channel_ring_t kring, slot_idx_t index)
171 {
172 ASSERT(kring->ckr_tx == NR_RX || kring->ckr_tx == NR_EV);
173 /* Ensure this is only done by the thread doing a sync syscall */
174 VERIFY(sk_is_sync_protected());
175
176 kring->ckr_ktail = SLOT_NEXT(index, kring->ckr_lim);
177 }
178
179 void
kern_channel_advance_slot(const kern_channel_ring_t kring,kern_channel_slot_t slot)180 kern_channel_advance_slot(const kern_channel_ring_t kring,
181 kern_channel_slot_t slot)
182 {
183 slot_idx_t index = KR_SLOT_INDEX(kring, slot);
184 ASSERT(index < kring->ckr_num_slots);
185
186 if (kring->ckr_tx == NR_TX) {
187 _kern_channel_advance_slot_tx(kring, index);
188 } else {
189 _kern_channel_advance_slot_rx(kring, index);
190 }
191 }
192
193 void *
kern_channel_get_context(const kern_channel_t ch)194 kern_channel_get_context(const kern_channel_t ch)
195 {
196 return ch->ch_ctx;
197 }
198
199 void *
kern_channel_ring_get_context(const kern_channel_ring_t kring)200 kern_channel_ring_get_context(const kern_channel_ring_t kring)
201 {
202 return kring->ckr_ctx;
203 }
204
205 errno_t
kern_channel_ring_get_container(const kern_channel_ring_t kring,kern_packet_t ** array,uint32_t * count)206 kern_channel_ring_get_container(const kern_channel_ring_t kring,
207 kern_packet_t **array, uint32_t *count)
208 {
209 /* Ensure this is only done by the thread doing a sync syscall */
210 VERIFY(sk_is_sync_protected());
211
212 if (array == NULL) {
213 return EINVAL;
214 }
215
216 *array = kring->ckr_scratch;
217 if (count != NULL) {
218 *count = na_get_nslots(kring->ckr_na, kring->ckr_tx);
219 }
220
221 return 0;
222 }
223
224 /*
225 * -fbounds-safety: This function is only used by kpipe (kplo_slot_fini), which
226 * we won't adopt -fbounds-safety until later. And kplo_slot_fini casts this to
227 * uintptr_t in the KPLO_VERIFY_CTX macro anyway. So having it as a plain void *
228 * without bounds information could be okay.
229 */
230 void *
kern_channel_slot_get_context(const kern_channel_ring_t kring,const kern_channel_slot_t slot)231 kern_channel_slot_get_context(const kern_channel_ring_t kring,
232 const kern_channel_slot_t slot)
233 {
234 slot_idx_t i = KR_SLOT_INDEX(kring, slot);
235 void *__single slot_ctx = 0;
236
237 if (kring->ckr_slot_ctxs != NULL) {
238 slot_ctx = kring->ckr_slot_ctxs[i].slot_ctx_arg;
239 }
240
241 return slot_ctx;
242 }
243
244 void
kern_channel_increment_ring_stats(kern_channel_ring_t kring,struct kern_channel_ring_stat_increment * stats)245 kern_channel_increment_ring_stats(kern_channel_ring_t kring,
246 struct kern_channel_ring_stat_increment *stats)
247 {
248 kr_update_stats(kring, stats->kcrsi_slots_transferred,
249 stats->kcrsi_bytes_transferred);
250 }
251
252 void
kern_channel_increment_ring_net_stats(kern_channel_ring_t kring,struct ifnet * ifp,struct kern_channel_ring_stat_increment * stats)253 kern_channel_increment_ring_net_stats(kern_channel_ring_t kring,
254 struct ifnet *ifp, struct kern_channel_ring_stat_increment *stats)
255 {
256 if (kring->ckr_tx == NR_TX) {
257 os_atomic_add(&ifp->if_data.ifi_opackets, stats->kcrsi_slots_transferred, relaxed);
258 os_atomic_add(&ifp->if_data.ifi_obytes, stats->kcrsi_bytes_transferred, relaxed);
259 } else {
260 os_atomic_add(&ifp->if_data.ifi_ipackets, stats->kcrsi_slots_transferred, relaxed);
261 os_atomic_add(&ifp->if_data.ifi_ibytes, stats->kcrsi_bytes_transferred, relaxed);
262 }
263
264 if (ifp->if_data_threshold != 0) {
265 ifnet_notify_data_threshold(ifp);
266 }
267
268 kr_update_stats(kring, stats->kcrsi_slots_transferred,
269 stats->kcrsi_bytes_transferred);
270 }
271
272 kern_packet_t
kern_channel_slot_get_packet(const kern_channel_ring_t kring,const kern_channel_slot_t slot)273 kern_channel_slot_get_packet(const kern_channel_ring_t kring,
274 const kern_channel_slot_t slot)
275 {
276 #if (DEVELOPMENT || DEBUG)
277 /* catch invalid slot */
278 slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
279 struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
280 #else
281 #pragma unused(kring)
282 struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
283 #endif /* (DEVELOPMENT || DEBUG) */
284 struct __kern_quantum *kqum = ksd->sd_qum;
285
286 if (__improbable(kqum == NULL ||
287 (kqum->qum_qflags & QUM_F_DROPPED) != 0)) {
288 return 0;
289 }
290
291 return SD_GET_TAGGED_METADATA(ksd);
292 }
293
294 errno_t
kern_channel_slot_attach_packet(const kern_channel_ring_t kring,const kern_channel_slot_t slot,kern_packet_t ph)295 kern_channel_slot_attach_packet(const kern_channel_ring_t kring,
296 const kern_channel_slot_t slot, kern_packet_t ph)
297 {
298 #if (DEVELOPMENT || DEBUG)
299 /* catch invalid slot */
300 slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
301 struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
302 #else
303 #pragma unused(kring)
304 struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
305 #endif /* (DEVELOPMENT || DEBUG) */
306
307 return KR_SLOT_ATTACH_METADATA(kring, ksd, SK_PTR_ADDR_KQUM(ph));
308 }
309
310 errno_t
kern_channel_slot_detach_packet(const kern_channel_ring_t kring,const kern_channel_slot_t slot,kern_packet_t ph)311 kern_channel_slot_detach_packet(const kern_channel_ring_t kring,
312 const kern_channel_slot_t slot, kern_packet_t ph)
313 {
314 #pragma unused(ph)
315 #if (DEVELOPMENT || DEBUG)
316 /* catch invalid slot */
317 slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
318 struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
319 #else
320 struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
321 #endif /* (DEVELOPMENT || DEBUG) */
322
323 ASSERT(SK_PTR_ADDR_KQUM(ph) ==
324 SK_PTR_ADDR_KQUM(SD_GET_TAGGED_METADATA(ksd)));
325 (void) KR_SLOT_DETACH_METADATA(kring, ksd);
326
327 return 0;
328 }
329
330 static errno_t
kern_channel_tx_refill_common(const kern_channel_ring_t hw_kring,uint32_t pkt_limit,uint32_t byte_limit,boolean_t tx_doorbell_ctxt,boolean_t * pkts_pending,boolean_t canblock)331 kern_channel_tx_refill_common(const kern_channel_ring_t hw_kring,
332 uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
333 boolean_t *pkts_pending, boolean_t canblock)
334 {
335 #pragma unused(tx_doorbell_ctxt)
336 struct nexus_adapter *hwna;
337 struct ifnet *ifp;
338 sk_protect_t protect;
339 errno_t rc = 0;
340 errno_t sync_err = 0;
341
342 KDBG((SK_KTRACE_CHANNEL_TX_REFILL | DBG_FUNC_START), SK_KVA(hw_kring));
343
344 VERIFY(hw_kring != NULL);
345 hwna = KRNA(hw_kring);
346 ifp = hwna->na_ifp;
347
348 ASSERT(hwna->na_type == NA_NETIF_DEV);
349 ASSERT(hw_kring->ckr_tx == NR_TX);
350 *pkts_pending = FALSE;
351
352 if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
353 SK_ERR("invalid limits plim %d, blim %d",
354 pkt_limit, byte_limit);
355 rc = EINVAL;
356 goto out;
357 }
358
359 if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
360 SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached",
361 SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
362 rc = ENXIO;
363 goto out;
364 }
365
366 if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
367 SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), "
368 "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
369 rc = ENXIO;
370 goto out;
371 }
372
373 /*
374 * if the ring is busy, it means another dequeue is in
375 * progress, so ignore this request and return success.
376 */
377 if (kr_enter(hw_kring, canblock) != 0) {
378 rc = 0;
379 goto out;
380 }
381
382 if (__improbable(KR_DROP(hw_kring) ||
383 !NA_IS_ACTIVE(hw_kring->ckr_na))) {
384 kr_exit(hw_kring);
385 SK_ERR("hw-kr 0x%llx stopped", SK_KVA(hw_kring));
386 rc = ENXIO;
387 goto out;
388 }
389
390 /*
391 * Unlikely to get here, unless a channel is opened by
392 * a user process directly to the netif. Issue a TX sync
393 * on the netif device TX ring.
394 */
395 protect = sk_sync_protect();
396 sync_err = hw_kring->ckr_na_sync(hw_kring, kernproc,
397 NA_SYNCF_NETIF);
398 sk_sync_unprotect(protect);
399 kr_exit(hw_kring);
400
401 if (rc == 0) {
402 rc = sync_err;
403 }
404
405 out:
406 KDBG((SK_KTRACE_CHANNEL_TX_REFILL | DBG_FUNC_END), SK_KVA(hw_kring),
407 rc, 0, 0);
408
409 return rc;
410 }
411
412 errno_t
kern_channel_tx_refill(const kern_channel_ring_t hw_kring,uint32_t pkt_limit,uint32_t byte_limit,boolean_t tx_doorbell_ctxt,boolean_t * pkts_pending)413 kern_channel_tx_refill(const kern_channel_ring_t hw_kring,
414 uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
415 boolean_t *pkts_pending)
416 {
417 if (NA_OWNED_BY_FSW(hw_kring->ckr_na)) {
418 return netif_ring_tx_refill(hw_kring, pkt_limit,
419 byte_limit, tx_doorbell_ctxt, pkts_pending, FALSE);
420 } else {
421 return kern_channel_tx_refill_common(hw_kring, pkt_limit,
422 byte_limit, tx_doorbell_ctxt, pkts_pending, FALSE);
423 }
424 }
425
426 errno_t
kern_channel_tx_refill_canblock(const kern_channel_ring_t hw_kring,uint32_t pkt_limit,uint32_t byte_limit,boolean_t tx_doorbell_ctxt,boolean_t * pkts_pending)427 kern_channel_tx_refill_canblock(const kern_channel_ring_t hw_kring,
428 uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
429 boolean_t *pkts_pending)
430 {
431 if (NA_OWNED_BY_FSW(hw_kring->ckr_na)) {
432 return netif_ring_tx_refill(hw_kring, pkt_limit,
433 byte_limit, tx_doorbell_ctxt, pkts_pending, TRUE);
434 } else {
435 return kern_channel_tx_refill_common(hw_kring, pkt_limit,
436 byte_limit, tx_doorbell_ctxt, pkts_pending, TRUE);
437 }
438 }
439
440 errno_t
kern_channel_get_service_class(const kern_channel_ring_t kring,kern_packet_svc_class_t * svc)441 kern_channel_get_service_class(const kern_channel_ring_t kring,
442 kern_packet_svc_class_t *svc)
443 {
444 if ((KRNA(kring)->na_type != NA_NETIF_DEV) ||
445 (kring->ckr_tx == NR_RX) || (kring->ckr_svc == KPKT_SC_UNSPEC)) {
446 return ENOTSUP;
447 }
448 *svc = kring->ckr_svc;
449 return 0;
450 }
451
452 void
kern_channel_flowadv_clear(struct flowadv_fcentry * fce)453 kern_channel_flowadv_clear(struct flowadv_fcentry *fce)
454 {
455 const flowadv_token_t ch_token = fce->fce_flowsrc_token;
456 const flowadv_token_t flow_token = fce->fce_flowid;
457 const flowadv_idx_t flow_fidx = fce->fce_flowsrc_fidx;
458 struct ifnet *ifp = fce->fce_ifp;
459 struct nexus_adapter *hwna;
460 struct kern_nexus *fsw_nx;
461 struct kern_channel *ch = NULL;
462 struct nx_flowswitch *fsw;
463
464 _CASSERT(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token));
465
466 SK_LOCK();
467 if (ifnet_is_attached(ifp, 0) == 0 || ifp->if_na == NULL) {
468 goto done;
469 }
470
471 hwna = &ifp->if_na->nifna_up;
472 VERIFY((hwna->na_type == NA_NETIF_DEV) ||
473 (hwna->na_type == NA_NETIF_COMPAT_DEV));
474
475 if (!NA_IS_ACTIVE(hwna) || (fsw = fsw_ifp_to_fsw(ifp)) == NULL) {
476 goto done;
477 }
478
479 fsw_nx = fsw->fsw_nx;
480 VERIFY(fsw_nx != NULL);
481
482 /* find the channel */
483 STAILQ_FOREACH(ch, &fsw_nx->nx_ch_head, ch_link) {
484 if (ch_token == ch->ch_info->cinfo_ch_token) {
485 break;
486 }
487 }
488
489 if (ch != NULL) {
490 if (ch->ch_na != NULL &&
491 na_flowadv_clear(ch, flow_fidx, flow_token)) {
492 /* trigger flow advisory kevent */
493 na_flowadv_event(
494 &ch->ch_na->na_tx_rings[ch->ch_first[NR_TX]]);
495 SK_DF(SK_VERB_FLOW_ADVISORY,
496 "%s(%d) notified of flow update",
497 ch->ch_name, ch->ch_pid);
498 } else if (ch->ch_na == NULL) {
499 SK_DF(SK_VERB_FLOW_ADVISORY,
500 "%s(%d) is closing (flow update ignored)",
501 ch->ch_name, ch->ch_pid);
502 }
503 } else {
504 SK_ERR("channel token 0x%x fidx %u on %s not found",
505 ch_token, flow_fidx, ifp->if_xname);
506 }
507 done:
508 SK_UNLOCK();
509 }
510
511 void
kern_channel_flowadv_report_ce_event(struct flowadv_fcentry * fce,uint32_t ce_cnt,uint32_t total_pkt_cnt)512 kern_channel_flowadv_report_ce_event(struct flowadv_fcentry *fce,
513 uint32_t ce_cnt, uint32_t total_pkt_cnt)
514 {
515 const flowadv_token_t ch_token = fce->fce_flowsrc_token;
516 const flowadv_token_t flow_token = fce->fce_flowid;
517 const flowadv_idx_t flow_fidx = fce->fce_flowsrc_fidx;
518 struct ifnet *ifp = fce->fce_ifp;
519 struct nexus_adapter *hwna;
520 struct kern_nexus *fsw_nx;
521 struct kern_channel *ch = NULL;
522 struct nx_flowswitch *fsw;
523
524 _CASSERT(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token));
525
526 SK_LOCK();
527 if (ifnet_is_attached(ifp, 0) == 0 || ifp->if_na == NULL) {
528 goto done;
529 }
530
531 hwna = &ifp->if_na->nifna_up;
532 VERIFY((hwna->na_type == NA_NETIF_DEV) ||
533 (hwna->na_type == NA_NETIF_COMPAT_DEV));
534
535 if (!NA_IS_ACTIVE(hwna) || (fsw = fsw_ifp_to_fsw(ifp)) == NULL) {
536 goto done;
537 }
538
539 fsw_nx = fsw->fsw_nx;
540 VERIFY(fsw_nx != NULL);
541
542 /* find the channel */
543 STAILQ_FOREACH(ch, &fsw_nx->nx_ch_head, ch_link) {
544 if (ch_token == ch->ch_info->cinfo_ch_token) {
545 break;
546 }
547 }
548
549 if (ch != NULL) {
550 if (ch->ch_na != NULL &&
551 na_flowadv_report_ce_event(ch, flow_fidx, flow_token,
552 ce_cnt, total_pkt_cnt)) {
553 SK_DF(SK_VERB_FLOW_ADVISORY,
554 "%s(%d) notified of flow update",
555 ch->ch_name, ch->ch_pid);
556 } else if (ch->ch_na == NULL) {
557 SK_DF(SK_VERB_FLOW_ADVISORY,
558 "%s(%d) is closing (flow update ignored)",
559 ch->ch_name, ch->ch_pid);
560 }
561 } else {
562 SK_ERR("channel token 0x%x fidx %u on %s not found",
563 ch_token, flow_fidx, ifp->if_xname);
564 }
565 done:
566 SK_UNLOCK();
567 }
568
569
570 void
kern_channel_memstatus(struct proc * p,uint32_t status,struct kern_channel * ch)571 kern_channel_memstatus(struct proc *p, uint32_t status,
572 struct kern_channel *ch)
573 {
574 #pragma unused(p, status)
575 SK_LOCK_ASSERT_NOTHELD();
576
577 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
578 ASSERT(proc_pid(p) == ch->ch_pid);
579 /*
580 * If we're already draining, then bail. Otherwise, check it
581 * again via na_drain() with the channel lock held.
582 */
583 if (ch->ch_na->na_flags & NAF_DRAINING) {
584 return;
585 }
586
587 SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%llx flags 0x%b status %s",
588 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(ch),
589 ch->ch_flags, CHANF_BITS, sk_memstatus2str(status));
590
591 /* serialize accesses against channel syscalls */
592 lck_mtx_lock(&ch->ch_lock);
593 na_drain(ch->ch_na, TRUE); /* purge caches */
594 lck_mtx_unlock(&ch->ch_lock);
595 }
596
597 static bool
_kern_channel_defunct_eligible(struct kern_channel * ch)598 _kern_channel_defunct_eligible(struct kern_channel *ch)
599 {
600 struct nexus_upipe_adapter *pna;
601
602 if ((ch->ch_info->cinfo_ch_mode & CHMODE_DEFUNCT_OK) == 0) {
603 return false;
604 }
605 if (ch->ch_na->na_type != NA_USER_PIPE) {
606 return true;
607 }
608 pna = (struct nexus_upipe_adapter *)ch->ch_na;
609 if ((pna->pna_parent->na_flags & NAF_DEFUNCT_OK) == 0) {
610 return false;
611 }
612 return true;
613 }
614
615 void
kern_channel_defunct(struct proc * p,struct kern_channel * ch)616 kern_channel_defunct(struct proc *p, struct kern_channel *ch)
617 {
618 uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;
619
620 SK_LOCK_ASSERT_NOTHELD();
621
622 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
623 ASSERT(proc_pid(p) == ch->ch_pid);
624 /*
625 * If the channel is eligible for defunct, mark it as such.
626 * Otherwise, set the draining flag which tells the reaper
627 * thread to purge any cached objects associated with it.
628 * That draining flag will be cleared then, which allows the
629 * channel to cache objects again once the process is resumed.
630 */
631 if (_kern_channel_defunct_eligible(ch)) {
632 struct kern_nexus *nx = ch->ch_nexus;
633 struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);
634 boolean_t need_defunct;
635 int err;
636
637 /*
638 * This may be called often, so check first (without lock) if
639 * the trapdoor flag CHANF_DEFUNCT has been set and bail if so,
640 * for performance reasons. This check is repeated below with
641 * the channel lock held.
642 */
643 if (ch->ch_flags & CHANF_DEFUNCT) {
644 return;
645 }
646
647 SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%llx flags 0x%b",
648 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(ch),
649 ch->ch_flags, CHANF_BITS);
650
651 /* serialize accesses against channel syscalls */
652 lck_mtx_lock(&ch->ch_lock);
653
654 /*
655 * If opportunistic defunct is in effect, skip the rest of
656 * the defunct work based on two cases:
657 *
658 * a) if the channel isn't using user packet pool; or
659 * b) if the channel is using user packet pool and we
660 * detect that there are outstanding allocations.
661 *
662 * Note that for case (a) above we essentially treat the
663 * channel as ineligible for defunct, and although it may
664 * be idle we'd leave the memory mapping intact. This
665 * should not be a concern as the majority of channels are
666 * on flowswitches where user packet pool is mandatory.
667 *
668 * If skipping, mark the channel with CHANF_DEFUNCT_SKIP
669 * and increment the stats (for flowswitch only).
670 */
671 if (sk_opp_defunct && (!(ch_mode & CHMODE_USER_PACKET_POOL) ||
672 !pp_isempty_upp(ch->ch_pp))) {
673 if (ch->ch_na->na_type == NA_FLOWSWITCH_VP) {
674 struct nx_flowswitch *fsw =
675 VPNA(ch->ch_na)->vpna_fsw;
676 STATS_INC(&fsw->fsw_stats,
677 FSW_STATS_CHAN_DEFUNCT_SKIP);
678 }
679 os_atomic_or(&ch->ch_flags, CHANF_DEFUNCT_SKIP,
680 relaxed);
681 /* skip defunct */
682 lck_mtx_unlock(&ch->ch_lock);
683 return;
684 }
685 os_atomic_andnot(&ch->ch_flags, CHANF_DEFUNCT_SKIP, relaxed);
686
687 /*
688 * Proceed with the rest of the defunct work.
689 */
690 if (os_atomic_or_orig(&ch->ch_flags, CHANF_DEFUNCT, relaxed) &
691 CHANF_DEFUNCT) {
692 /* already defunct; nothing to do */
693 lck_mtx_unlock(&ch->ch_lock);
694 return;
695 }
696
697 /* mark this channel as inactive */
698 ch_deactivate(ch);
699
700 /*
701 * Redirect memory regions for the map; upon success, instruct
702 * the nexus to finalize the defunct and teardown the respective
703 * memory regions. It's crucial that the redirection happens
704 * first before freeing the objects, since the page protection
705 * flags get inherited only from unfreed segments. Freed ones
706 * will cause VM_PROT_NONE to be used for the segment span, to
707 * catch use-after-free cases. For unfreed objects, doing so
708 * may cause an exception when the process is later resumed
709 * and touches an address within the span; hence the ordering.
710 */
711 if ((err = skmem_arena_mredirect(ch->ch_na->na_arena,
712 &ch->ch_mmap, p, &need_defunct)) == 0 && need_defunct) {
713 /*
714 * Let the domain provider handle the initial tasks of
715 * the defunct that are specific to this channel. It
716 * may safely free objects as the redirection is done.
717 */
718 nxdom_prov->nxdom_prov_dom->nxdom_defunct(nxdom_prov,
719 nx, ch, p);
720 /*
721 * Let the domain provider complete the defunct;
722 * do this after dropping the channel lock, as
723 * the nexus may end up acquiring other locks
724 * that would otherwise violate lock ordering.
725 * The channel refcnt is still held by virtue
726 * of the caller holding the process's file
727 * table lock.
728 */
729 lck_mtx_unlock(&ch->ch_lock);
730 nxdom_prov->nxdom_prov_dom->nxdom_defunct_finalize(
731 nxdom_prov, nx, ch, FALSE);
732 } else if (err == 0) {
733 /*
734 * Let the domain provider handle the initial tasks of
735 * the defunct that are specific to this channel. It
736 * may sadely free objects as the redirection is done.
737 */
738 nxdom_prov->nxdom_prov_dom->nxdom_defunct(nxdom_prov,
739 nx, ch, p);
740 lck_mtx_unlock(&ch->ch_lock);
741 } else {
742 /* already redirected; nothing to do */
743 lck_mtx_unlock(&ch->ch_lock);
744 }
745 } else {
746 lck_mtx_lock(&ch->ch_lock);
747 na_drain(ch->ch_na, FALSE); /* prune caches */
748 lck_mtx_unlock(&ch->ch_lock);
749 }
750 }
751