xref: /xnu-10002.1.13/bsd/skywalk/channel/channel_kern.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/kdebug.h>
#include <skywalk/os_skywalk_private.h>
#include <net/ntstat.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/upipe/nx_user_pipe.h>

#define KRING_EMPTY_TX(_kring, _index)  \
	((_kring)->ckr_rhead == (_index))

#define KRING_FULL_RX(_kring, _index)                                   \
	((_kring)->ckr_khead == SLOT_NEXT((_index), (_kring)->ckr_lim))

uint32_t
kern_channel_notify(const kern_channel_ring_t kring, uint32_t flags)
{
#pragma unused(flags)
	if (__improbable(KR_DROP(kring))) {
		return ENXIO;
	}

	return kring->ckr_na_notify(kring, kernproc, 0);
}

uint32_t
kern_channel_reclaim(const kern_channel_ring_t kring)
{
	return kr_reclaim(kring);
}
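
/*
 * Illustrative sketch (not part of the original file): a hypothetical
 * driver completion path that wakes up a channel once new RX work has
 * been posted, and asks the TX ring to reclaim slots already processed
 * by a prior sync.  The struct my_dev, its md_rx_kring/md_tx_kring
 * fields and my_dev_rx_complete() are assumed names for this example;
 * only kern_channel_notify() and kern_channel_reclaim() are from this
 * file.
 */
#if 0
static void
my_dev_rx_complete(struct my_dev *dev)
{
	/* wake up anyone waiting on the RX ring that was just filled */
	(void) kern_channel_notify(dev->md_rx_kring, 0);

	/* fold already-processed TX slots back for reuse */
	(void) kern_channel_reclaim(dev->md_tx_kring);
}
#endif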

static inline uint32_t
_kern_channel_available_slot_count_tx(const kern_channel_ring_t kring,
    slot_idx_t index)
{
	ASSERT(kring->ckr_tx == NR_TX);

	if (kring->ckr_rhead < index) {
		return kring->ckr_num_slots + kring->ckr_rhead - index;
	}

	return kring->ckr_rhead - index;
}

static inline uint32_t
_kern_channel_available_slot_count_rx(const kern_channel_ring_t kring,
    slot_idx_t index)
{
	uint32_t busy;
	slot_idx_t lim = kring->ckr_lim;

	ASSERT(kring->ckr_tx == NR_RX);

	if (index < kring->ckr_khead) {
		busy = kring->ckr_num_slots + index - kring->ckr_khead;
	} else {
		busy = index - kring->ckr_khead;
	}

	ASSERT(lim >= busy);
	return lim - busy;
}

uint32_t
kern_channel_available_slot_count(const kern_channel_ring_t kring)
{
	if (kring->ckr_tx == NR_TX) {
		return _kern_channel_available_slot_count_tx(kring,
		           kring->ckr_khead);
	} else {
		return _kern_channel_available_slot_count_rx(kring,
		           kring->ckr_ktail);
	}
}
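
/*
 * Illustrative sketch (not part of the original file): using the slot
 * count to bound a fill loop.  my_dev_fill_rx_slot() and struct my_dev
 * are hypothetical; only kern_channel_available_slot_count() is real.
 */
#if 0
static void
my_dev_refill_rx(struct my_dev *dev, kern_channel_ring_t rx_kring)
{
	uint32_t avail = kern_channel_available_slot_count(rx_kring);

	while (avail-- > 0) {
		if (!my_dev_fill_rx_slot(dev, rx_kring)) {
			break;	/* no more hardware completions */
		}
	}
}
#endif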

kern_channel_slot_t
kern_channel_get_next_slot(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot0, struct kern_slot_prop *prop)
{
	kern_channel_slot_t slot;
	slot_idx_t slot_idx;

	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());

	if (__improbable(slot0 == NULL)) {
		if (kring->ckr_tx == NR_TX) {
			slot_idx = kring->ckr_khead;
		} else {
			slot_idx = kring->ckr_ktail;
		}
	} else {
		slot_idx = SLOT_NEXT(KR_SLOT_INDEX(kring, slot0),
		    kring->ckr_lim);
	}

	ASSERT(slot_idx < kring->ckr_num_slots);

	if (kring->ckr_tx == NR_TX) {
		if (__improbable(KRING_EMPTY_TX(kring, slot_idx))) {
			SK_DF(SK_VERB_SYNC | SK_VERB_TX,
			    "EMPTY_TX: na \"%s\" kr \"%s\" "
			    "i %u (kc %u kt %u kl %u | rh %u rt %u)",
			    KRNA(kring)->na_name,
			    kring->ckr_name, slot_idx, kring->ckr_khead,
			    kring->ckr_ktail, kring->ckr_klease,
			    kring->ckr_rhead, kring->ckr_rtail);
			slot = NULL;
		} else {
			slot = &kring->ckr_ksds[slot_idx];
		}
	} else {
		if (__improbable(KRING_FULL_RX(kring, slot_idx))) {
			SK_DF(SK_VERB_SYNC | SK_VERB_RX,
			    "FULL_RX: na \"%s\" kr \"%s\" "
			    "i %u (kc %u kt %u kl %u | rh %u rt %u)",
			    KRNA(kring)->na_name,
			    kring->ckr_name, slot_idx, kring->ckr_khead,
			    kring->ckr_ktail, kring->ckr_klease,
			    kring->ckr_rhead, kring->ckr_rtail);
			slot = NULL;
		} else {
			slot = &kring->ckr_ksds[slot_idx];
		}
	}

	if (prop != NULL) {
		bzero(prop, sizeof(*prop));
	}

	return slot;
}

static inline void
_kern_channel_advance_slot_tx(const kern_channel_ring_t kring, slot_idx_t index)
{
	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());
	kr_txkring_reclaim_and_refill(kring, index);
}

static inline void
_kern_channel_advance_slot_rx(const kern_channel_ring_t kring, slot_idx_t index)
{
	ASSERT(kring->ckr_tx == NR_RX || kring->ckr_tx == NR_EV);
	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());

	kring->ckr_ktail = SLOT_NEXT(index, kring->ckr_lim);
}

void
kern_channel_advance_slot(const kern_channel_ring_t kring,
    kern_channel_slot_t slot)
{
	slot_idx_t index = KR_SLOT_INDEX(kring, slot);
	ASSERT(index < kring->ckr_num_slots);

	if (kring->ckr_tx == NR_TX) {
		_kern_channel_advance_slot_tx(kring, index);
	} else {
		_kern_channel_advance_slot_rx(kring, index);
	}
}
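
/*
 * Illustrative sketch (not part of the original file): the usual pattern
 * for walking a ring from inside a sync callback -- start with a NULL
 * slot, iterate with kern_channel_get_next_slot(), remember the last
 * slot actually consumed, and advance the ring past it exactly once.
 * my_dev_program_tx_descriptor() and struct my_dev are hypothetical.
 */
#if 0
static void
my_dev_tx_sync_walk(struct my_dev *dev, kern_channel_ring_t tx_kring)
{
	kern_channel_slot_t slot, last = NULL;
	kern_packet_t ph;

	slot = kern_channel_get_next_slot(tx_kring, NULL, NULL);
	while (slot != NULL) {
		ph = kern_channel_slot_get_packet(tx_kring, slot);
		if (ph != 0) {
			my_dev_program_tx_descriptor(dev, ph);
		}
		last = slot;
		slot = kern_channel_get_next_slot(tx_kring, slot, NULL);
	}

	if (last != NULL) {
		/* make the consumed range visible to the ring */
		kern_channel_advance_slot(tx_kring, last);
	}
}
#endif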

void *
kern_channel_get_context(const kern_channel_t ch)
{
	return ch->ch_ctx;
}

void *
kern_channel_ring_get_context(const kern_channel_ring_t kring)
{
	return kring->ckr_ctx;
}
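
/*
 * Illustrative sketch (not part of the original file): retrieving
 * per-ring driver state from inside a sync callback.  This assumes the
 * driver registered a context pointer for the ring when it set up the
 * nexus provider; struct my_dev_ring is a hypothetical type.
 */
#if 0
static void
my_dev_tx_sync(kern_channel_ring_t tx_kring)
{
	struct my_dev_ring *drv_ring;

	drv_ring = kern_channel_ring_get_context(tx_kring);
	if (drv_ring == NULL) {
		return;	/* ring was set up without a context */
	}
	/* ... use drv_ring to reach the hardware queue ... */
}
#endif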

errno_t
kern_channel_ring_get_container(const kern_channel_ring_t kring,
    kern_packet_t **array, uint32_t *count)
{
	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());

	if (array == NULL) {
		return EINVAL;
	}

	*array = kring->ckr_scratch;
	if (count != NULL) {
		*count = na_get_nslots(kring->ckr_na, kring->ckr_tx);
	}

	return 0;
}
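
/*
 * Illustrative sketch (not part of the original file): borrowing the
 * ring's scratch container to batch packet handles during a sync.  The
 * array is only meaningful while the sync is in progress.
 * my_dev_harvest_rx() is a hypothetical routine that fills the array
 * and returns how many entries it used.
 */
#if 0
static void
my_dev_rx_sync_batch(struct my_dev *dev, kern_channel_ring_t rx_kring)
{
	kern_packet_t *pkts = NULL;
	uint32_t max = 0, used;
	errno_t err;

	err = kern_channel_ring_get_container(rx_kring, &pkts, &max);
	if (err != 0) {
		return;
	}

	used = my_dev_harvest_rx(dev, pkts, max);
	/* ... attach the harvested packets to RX slots ... */
}
#endif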

void *
kern_channel_slot_get_context(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot)
{
	slot_idx_t i = KR_SLOT_INDEX(kring, slot);
	void *slot_ctx = NULL;

	if (kring->ckr_slot_ctxs != NULL) {
		slot_ctx = (void *)(kring->ckr_slot_ctxs[i].slot_ctx_arg);
	}

	return slot_ctx;
}

void
kern_channel_increment_ring_stats(kern_channel_ring_t kring,
    struct kern_channel_ring_stat_increment *stats)
{
	kr_update_stats(kring, stats->kcrsi_slots_transferred,
	    stats->kcrsi_bytes_transferred);
}

void
kern_channel_increment_ring_net_stats(kern_channel_ring_t kring,
    struct ifnet *ifp, struct kern_channel_ring_stat_increment *stats)
{
	if (kring->ckr_tx == NR_TX) {
		os_atomic_add(&ifp->if_data.ifi_opackets, stats->kcrsi_slots_transferred, relaxed);
		os_atomic_add(&ifp->if_data.ifi_obytes, stats->kcrsi_bytes_transferred, relaxed);
	} else {
		os_atomic_add(&ifp->if_data.ifi_ipackets, stats->kcrsi_slots_transferred, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes, stats->kcrsi_bytes_transferred, relaxed);
	}

	if (ifp->if_data_threshold != 0) {
		ifnet_notify_data_threshold(ifp);
	}

	kr_update_stats(kring, stats->kcrsi_slots_transferred,
	    stats->kcrsi_bytes_transferred);
}
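
/*
 * Illustrative sketch (not part of the original file): accounting for a
 * completed TX batch.  The slot/byte totals would normally be
 * accumulated while walking the ring; the function name is hypothetical,
 * the kcrsi_* fields and the two KPI calls are from this file.
 */
#if 0
static void
my_dev_tx_account(kern_channel_ring_t tx_kring, struct ifnet *ifp,
    uint32_t slots, uint32_t bytes)
{
	struct kern_channel_ring_stat_increment stats;

	bzero(&stats, sizeof(stats));
	stats.kcrsi_slots_transferred = slots;
	stats.kcrsi_bytes_transferred = bytes;

	/* ring counters plus interface-level packet/byte counters */
	kern_channel_increment_ring_net_stats(tx_kring, ifp, &stats);
}
#endif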

kern_packet_t
kern_channel_slot_get_packet(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot)
{
#if (DEVELOPMENT || DEBUG)
	/* catch invalid slot */
	slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
	struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
#else
#pragma unused(kring)
	struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
#endif /* (DEVELOPMENT || DEBUG) */
	struct __kern_quantum *kqum = ksd->sd_qum;

	if (__improbable(kqum == NULL ||
	    (kqum->qum_qflags & QUM_F_DROPPED) != 0)) {
		return 0;
	}

	return SD_GET_TAGGED_METADATA(ksd);
}

errno_t
kern_channel_slot_attach_packet(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot, kern_packet_t ph)
{
#if (DEVELOPMENT || DEBUG)
	/* catch invalid slot */
	slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
	struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
#else
#pragma unused(kring)
	struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
#endif /* (DEVELOPMENT || DEBUG) */

	return KR_SLOT_ATTACH_METADATA(kring, ksd, SK_PTR_ADDR_KQUM(ph));
}

errno_t
kern_channel_slot_detach_packet(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot, kern_packet_t ph)
{
#pragma unused(ph)
#if (DEVELOPMENT || DEBUG)
	/* catch invalid slot */
	slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
	struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
#else
	struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
#endif /* (DEVELOPMENT || DEBUG) */

	ASSERT(SK_PTR_ADDR_KQUM(ph) ==
	    SK_PTR_ADDR_KQUM(SD_GET_TAGGED_METADATA(ksd)));
	(void) KR_SLOT_DETACH_METADATA(kring, ksd);

	return 0;
}
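
/*
 * Illustrative sketch (not part of the original file): moving packets
 * between a driver and the ring.  On RX, a completed packet is tagged
 * onto a slot with kern_channel_slot_attach_packet(); on TX, the packet
 * is read and detached before being handed to hardware.  The my_dev_*
 * function names are hypothetical and error handling is elided.
 */
#if 0
static errno_t
my_dev_rx_deliver(kern_channel_ring_t rx_kring, kern_channel_slot_t slot,
    kern_packet_t ph)
{
	return kern_channel_slot_attach_packet(rx_kring, slot, ph);
}

static kern_packet_t
my_dev_tx_take(kern_channel_ring_t tx_kring, kern_channel_slot_t slot)
{
	kern_packet_t ph = kern_channel_slot_get_packet(tx_kring, slot);

	if (ph != 0) {
		/* claim it back from the slot before programming hardware */
		(void) kern_channel_slot_detach_packet(tx_kring, slot, ph);
	}
	return ph;
}
#endif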

static errno_t
kern_channel_tx_refill_common(const kern_channel_ring_t hw_kring,
    uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
    boolean_t *pkts_pending, boolean_t canblock)
{
#pragma unused(tx_doorbell_ctxt)
	struct nexus_adapter *hwna;
	struct ifnet *ifp;
	sk_protect_t protect;
	errno_t rc = 0;
	errno_t sync_err = 0;

	KDBG((SK_KTRACE_CHANNEL_TX_REFILL | DBG_FUNC_START), SK_KVA(hw_kring));

	VERIFY(hw_kring != NULL);
	hwna = KRNA(hw_kring);
	ifp = hwna->na_ifp;

	ASSERT(hwna->na_type == NA_NETIF_DEV);
	ASSERT(hw_kring->ckr_tx == NR_TX);
	*pkts_pending = FALSE;

	if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
		SK_ERR("invalid limits plim %d, blim %d",
		    pkt_limit, byte_limit);
		rc = EINVAL;
		goto out;
	}

	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
		SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached",
		    SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), "
		    "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	/*
	 * if the ring is busy, it means another dequeue is in
	 * progress, so ignore this request and return success.
	 */
	if (kr_enter(hw_kring, canblock) != 0) {
		rc = 0;
		goto out;
	}

	if (__improbable(KR_DROP(hw_kring) ||
	    !NA_IS_ACTIVE(hw_kring->ckr_na))) {
		kr_exit(hw_kring);
		SK_ERR("hw-kr 0x%llx stopped", SK_KVA(hw_kring));
		rc = ENXIO;
		goto out;
	}

	/*
	 * Unlikely to get here, unless a channel is opened by
	 * a user process directly to the netif.  Issue a TX sync
	 * on the netif device TX ring.
	 */
	protect = sk_sync_protect();
	sync_err = hw_kring->ckr_na_sync(hw_kring, kernproc,
	    NA_SYNCF_NETIF);
	sk_sync_unprotect(protect);
	kr_exit(hw_kring);

	if (rc == 0) {
		rc = sync_err;
	}

out:
	KDBG((SK_KTRACE_CHANNEL_TX_REFILL | DBG_FUNC_END), SK_KVA(hw_kring),
	    rc, 0, 0);

	return rc;
}

errno_t
kern_channel_tx_refill(const kern_channel_ring_t hw_kring,
    uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
    boolean_t *pkts_pending)
{
	if (NA_OWNED_BY_FSW(hw_kring->ckr_na)) {
		return netif_ring_tx_refill(hw_kring, pkt_limit,
		           byte_limit, tx_doorbell_ctxt, pkts_pending, FALSE);
	} else {
		return kern_channel_tx_refill_common(hw_kring, pkt_limit,
		           byte_limit, tx_doorbell_ctxt, pkts_pending, FALSE);
	}
}

errno_t
kern_channel_tx_refill_canblock(const kern_channel_ring_t hw_kring,
    uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
    boolean_t *pkts_pending)
{
	if (NA_OWNED_BY_FSW(hw_kring->ckr_na)) {
		return netif_ring_tx_refill(hw_kring, pkt_limit,
		           byte_limit, tx_doorbell_ctxt, pkts_pending, TRUE);
	} else {
		return kern_channel_tx_refill_common(hw_kring, pkt_limit,
		           byte_limit, tx_doorbell_ctxt, pkts_pending, TRUE);
	}
}
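
/*
 * Illustrative sketch (not part of the original file): a doorbell
 * handler asking for the netif TX ring to be refilled.  The limits, the
 * struct my_dev fields and my_dev_schedule_tx() are hypothetical;
 * kern_channel_tx_refill() is the KPI defined above.
 */
#if 0
static void
my_dev_tx_doorbell(struct my_dev *dev)
{
	boolean_t pending = FALSE;
	errno_t err;

	err = kern_channel_tx_refill(dev->md_tx_kring,
	    128 /* pkt_limit */, 64 * 1024 /* byte_limit */,
	    TRUE /* doorbell context */, &pending);
	if (err == 0 && pending) {
		/* work was left behind; schedule another refill pass */
		my_dev_schedule_tx(dev);
	}
}
#endif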

errno_t
kern_channel_get_service_class(const kern_channel_ring_t kring,
    kern_packet_svc_class_t *svc)
{
	if ((KRNA(kring)->na_type != NA_NETIF_DEV) ||
	    (kring->ckr_tx == NR_RX) || (kring->ckr_svc == KPKT_SC_UNSPEC)) {
		return ENOTSUP;
	}
	*svc = kring->ckr_svc;
	return 0;
}
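
/*
 * Illustrative sketch (not part of the original file): mapping a TX ring
 * to a hardware queue based on its service class.  my_dev_queue_for_svc()
 * and MY_DEV_DEFAULT_TXQ are hypothetical; only
 * kern_channel_get_service_class() is from this file.
 */
#if 0
static uint32_t
my_dev_tx_queue_index(struct my_dev *dev, kern_channel_ring_t tx_kring)
{
	kern_packet_svc_class_t svc;

	if (kern_channel_get_service_class(tx_kring, &svc) != 0) {
		return MY_DEV_DEFAULT_TXQ;	/* hypothetical fallback */
	}
	return my_dev_queue_for_svc(dev, svc);
}
#endif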

void
kern_channel_flowadv_clear(struct flowadv_fcentry *fce)
{
	const flowadv_token_t ch_token = fce->fce_flowsrc_token;
	const flowadv_token_t flow_token = fce->fce_flowid;
	const flowadv_idx_t flow_fidx = fce->fce_flowsrc_fidx;
	struct ifnet *ifp = fce->fce_ifp;
	struct nexus_adapter *hwna;
	struct kern_nexus *fsw_nx;
	struct kern_channel *ch = NULL;
	struct nx_flowswitch *fsw;

	_CASSERT(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token));

	SK_LOCK();
	if (ifnet_is_attached(ifp, 0) == 0 || ifp->if_na == NULL) {
		goto done;
	}

	hwna = &ifp->if_na->nifna_up;
	VERIFY((hwna->na_type == NA_NETIF_DEV) ||
	    (hwna->na_type == NA_NETIF_COMPAT_DEV));

	if (!NA_IS_ACTIVE(hwna) || (fsw = fsw_ifp_to_fsw(ifp)) == NULL) {
		goto done;
	}

	fsw_nx = fsw->fsw_nx;
	VERIFY(fsw_nx != NULL);

	/* find the channel */
	STAILQ_FOREACH(ch, &fsw_nx->nx_ch_head, ch_link) {
		if (ch_token == ch->ch_info->cinfo_ch_token) {
			break;
		}
	}

	if (ch != NULL) {
		if (ch->ch_na != NULL &&
		    na_flowadv_clear(ch, flow_fidx, flow_token)) {
			/* trigger flow advisory kevent */
			na_flowadv_event(
				&ch->ch_na->na_tx_rings[ch->ch_first[NR_TX]]);
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) notified of flow update",
			    ch->ch_name, ch->ch_pid);
		} else if (ch->ch_na == NULL) {
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) is closing (flow update ignored)",
			    ch->ch_name, ch->ch_pid);
		}
	} else {
		SK_ERR("channel token 0x%x fidx %u on %s not found",
		    ch_token, flow_fidx, ifp->if_xname);
	}
done:
	SK_UNLOCK();
}

void
kern_channel_flowadv_report_ce_event(struct flowadv_fcentry *fce,
    uint32_t ce_cnt, uint32_t total_pkt_cnt)
{
	const flowadv_token_t ch_token = fce->fce_flowsrc_token;
	const flowadv_token_t flow_token = fce->fce_flowid;
	const flowadv_idx_t flow_fidx = fce->fce_flowsrc_fidx;
	struct ifnet *ifp = fce->fce_ifp;
	struct nexus_adapter *hwna;
	struct kern_nexus *fsw_nx;
	struct kern_channel *ch = NULL;
	struct nx_flowswitch *fsw;

	_CASSERT(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token));

	SK_LOCK();
	if (ifnet_is_attached(ifp, 0) == 0 || ifp->if_na == NULL) {
		goto done;
	}

	hwna = &ifp->if_na->nifna_up;
	VERIFY((hwna->na_type == NA_NETIF_DEV) ||
	    (hwna->na_type == NA_NETIF_COMPAT_DEV));

	if (!NA_IS_ACTIVE(hwna) || (fsw = fsw_ifp_to_fsw(ifp)) == NULL) {
		goto done;
	}

	fsw_nx = fsw->fsw_nx;
	VERIFY(fsw_nx != NULL);

	/* find the channel */
	STAILQ_FOREACH(ch, &fsw_nx->nx_ch_head, ch_link) {
		if (ch_token == ch->ch_info->cinfo_ch_token) {
			break;
		}
	}

	if (ch != NULL) {
		if (ch->ch_na != NULL &&
		    na_flowadv_report_ce_event(ch, flow_fidx, flow_token,
		    ce_cnt, total_pkt_cnt)) {
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) notified of flow update",
			    ch->ch_name, ch->ch_pid);
		} else if (ch->ch_na == NULL) {
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) is closing (flow update ignored)",
			    ch->ch_name, ch->ch_pid);
		}
	} else {
		SK_ERR("channel token 0x%x fidx %u on %s not found",
		    ch_token, flow_fidx, ifp->if_xname);
	}
done:
	SK_UNLOCK();
}


void
kern_channel_memstatus(struct proc *p, uint32_t status,
    struct kern_channel *ch)
{
#pragma unused(p, status)
	SK_LOCK_ASSERT_NOTHELD();

	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(proc_pid(p) == ch->ch_pid);
	/*
	 * If we're already draining, then bail.  Otherwise, check it
	 * again via na_drain() with the channel lock held.
	 */
	if (ch->ch_na->na_flags & NAF_DRAINING) {
		return;
	}

	SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%llx flags 0x%b status %s",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(ch),
	    ch->ch_flags, CHANF_BITS, sk_memstatus2str(status));

	/* serialize accesses against channel syscalls */
	lck_mtx_lock(&ch->ch_lock);
	na_drain(ch->ch_na, TRUE);   /* purge caches */
	lck_mtx_unlock(&ch->ch_lock);
}

static bool
_kern_channel_defunct_eligible(struct kern_channel *ch)
{
	struct nexus_upipe_adapter *pna;

	if ((ch->ch_info->cinfo_ch_mode & CHMODE_DEFUNCT_OK) == 0) {
		return false;
	}
	if (ch->ch_na->na_type != NA_USER_PIPE) {
		return true;
	}
	pna = (struct nexus_upipe_adapter *)ch->ch_na;
	if ((pna->pna_parent->na_flags & NAF_DEFUNCT_OK) == 0) {
		return false;
	}
	return true;
}

void
kern_channel_defunct(struct proc *p, struct kern_channel *ch)
{
#pragma unused(p)
	uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;

	SK_LOCK_ASSERT_NOTHELD();

	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(proc_pid(p) == ch->ch_pid);
	/*
	 * If the channel is eligible for defunct, mark it as such.
	 * Otherwise, set the draining flag which tells the reaper
	 * thread to purge any cached objects associated with it.
	 * That draining flag will be cleared then, which allows the
	 * channel to cache objects again once the process is resumed.
	 */
	if (_kern_channel_defunct_eligible(ch)) {
		struct kern_nexus *nx = ch->ch_nexus;
		struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);
		boolean_t need_defunct;
		int err;

		/*
		 * This may be called often, so check first (without lock) if
		 * the trapdoor flag CHANF_DEFUNCT has been set and bail if so,
		 * for performance reasons.  This check is repeated below with
		 * the channel lock held.
		 */
		if (ch->ch_flags & CHANF_DEFUNCT) {
			return;
		}

		SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%llx flags 0x%b",
		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(ch),
		    ch->ch_flags, CHANF_BITS);

		/* serialize accesses against channel syscalls */
		lck_mtx_lock(&ch->ch_lock);

		/*
		 * If opportunistic defunct is in effect, skip the rest of
		 * the defunct work based on two cases:
		 *
		 *   a) if the channel isn't using user packet pool; or
		 *   b) if the channel is using user packet pool and we
		 *      detect that there are outstanding allocations.
		 *
		 * Note that for case (a) above we essentially treat the
		 * channel as ineligible for defunct, and although it may
		 * be idle we'd leave the memory mapping intact.  This
		 * should not be a concern as the majority of channels are
		 * on flowswitches where user packet pool is mandatory.
		 *
		 * If skipping, mark the channel with CHANF_DEFUNCT_SKIP
		 * and increment the stats (for flowswitch only).
		 */
		if (sk_opp_defunct && (!(ch_mode & CHMODE_USER_PACKET_POOL) ||
		    !pp_isempty_upp(ch->ch_pp))) {
			if (ch->ch_na->na_type == NA_FLOWSWITCH_VP) {
				struct nx_flowswitch *fsw =
				    VPNA(ch->ch_na)->vpna_fsw;
				STATS_INC(&fsw->fsw_stats,
				    FSW_STATS_CHAN_DEFUNCT_SKIP);
			}
			os_atomic_or(&ch->ch_flags, CHANF_DEFUNCT_SKIP,
			    relaxed);
			/* skip defunct */
			lck_mtx_unlock(&ch->ch_lock);
			return;
		}
		os_atomic_andnot(&ch->ch_flags, CHANF_DEFUNCT_SKIP, relaxed);

		/*
		 * Proceed with the rest of the defunct work.
		 */
		if (os_atomic_or_orig(&ch->ch_flags, CHANF_DEFUNCT, relaxed) &
		    CHANF_DEFUNCT) {
			/* already defunct; nothing to do */
			lck_mtx_unlock(&ch->ch_lock);
			return;
		}

		/* mark this channel as inactive */
		ch_deactivate(ch);

		/*
		 * Redirect memory regions for the map; upon success, instruct
		 * the nexus to finalize the defunct and teardown the respective
		 * memory regions.  It's crucial that the redirection happens
		 * first before freeing the objects, since the page protection
		 * flags get inherited only from unfreed segments.  Freed ones
		 * will cause VM_PROT_NONE to be used for the segment span, to
		 * catch use-after-free cases.  For unfreed objects, doing so
		 * may cause an exception when the process is later resumed
		 * and touches an address within the span; hence the ordering.
		 */
		if ((err = skmem_arena_mredirect(ch->ch_na->na_arena,
		    &ch->ch_mmap, p, &need_defunct)) == 0 && need_defunct) {
			/*
			 * Let the domain provider handle the initial tasks of
			 * the defunct that are specific to this channel.  It
			 * may safely free objects as the redirection is done.
			 */
			nxdom_prov->nxdom_prov_dom->nxdom_defunct(nxdom_prov,
			    nx, ch, p);
			/*
			 * Let the domain provider complete the defunct;
			 * do this after dropping the channel lock, as
			 * the nexus may end up acquiring other locks
			 * that would otherwise violate lock ordering.
			 * The channel refcnt is still held by virtue
			 * of the caller holding the process's file
			 * table lock.
			 */
			lck_mtx_unlock(&ch->ch_lock);
			nxdom_prov->nxdom_prov_dom->nxdom_defunct_finalize(
				nxdom_prov, nx, ch, FALSE);
		} else if (err == 0) {
			/*
			 * Let the domain provider handle the initial tasks of
			 * the defunct that are specific to this channel.  It
			 * may safely free objects as the redirection is done.
			 */
			nxdom_prov->nxdom_prov_dom->nxdom_defunct(nxdom_prov,
			    nx, ch, p);
			lck_mtx_unlock(&ch->ch_lock);
		} else {
			/* already redirected; nothing to do */
			lck_mtx_unlock(&ch->ch_lock);
		}
	} else {
		lck_mtx_lock(&ch->ch_lock);
		na_drain(ch->ch_na, FALSE);  /* prune caches */
		lck_mtx_unlock(&ch->ch_lock);
	}
}
746