xref: /xnu-8019.80.24/osfmk/ipc/ipc_pset.c (revision a325d9c4a84054e40bbe985afedcb50ab80993ea)
1 /*
2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	ipc/ipc_pset.c
60  *	Author:	Rich Draves
61  *	Date:	1989
62  *
63  *	Functions to manipulate IPC port sets.
64  */
65 
66 #include <mach/port.h>
67 #include <mach/kern_return.h>
68 #include <mach/message.h>
69 #include <ipc/ipc_mqueue.h>
70 #include <ipc/ipc_object.h>
71 #include <ipc/ipc_pset.h>
72 #include <ipc/ipc_right.h>
73 #include <ipc/ipc_space.h>
74 #include <ipc/ipc_port.h>
75 #include <ipc/ipc_kmsg.h>
76 
77 #include <kern/kern_types.h>
78 
79 #include <vm/vm_map.h>
80 #include <libkern/section_keywords.h>
81 #include <pthread/priority_private.h>
82 
83 /* processor_set stole ipc_pset_init */
84 static void
ipc_port_set_init(ipc_pset_t pset,int policy)85 ipc_port_set_init(ipc_pset_t pset, int policy)
86 {
87 	policy |= SYNC_POLICY_FIFO | SYNC_POLICY_PORT_SET;
88 	waitq_set_init(&pset->ips_wqset, policy);
89 	klist_init(&pset->ips_klist);
90 }
91 
92 /*
93  *	Routine:	ipc_pset_alloc
94  *	Purpose:
95  *		Allocate a port set.
96  *	Conditions:
97  *		Nothing locked.  If successful, the port set is returned
98  *		locked.  (The caller doesn't have a reference.)
99  *	Returns:
100  *		KERN_SUCCESS		The port set is allocated.
101  *		KERN_INVALID_TASK	The space is dead.
102  *		KERN_NO_SPACE		No room for an entry in the space.
103  */
104 
kern_return_t
ipc_pset_alloc(
	ipc_space_t             space,
	mach_port_name_t        *namep,
	ipc_pset_t              *psetp)
{
	ipc_pset_t pset;
	mach_port_name_t name;
	kern_return_t kr;

	/* allocate the object and a fresh name for it in the space */
	kr = ipc_object_alloc(space, IOT_PORT_SET,
	    MACH_PORT_TYPE_PORT_SET, 0,
	    &name, (ipc_object_t *) &pset);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* space is locked */

	/* initialize while the space lock still hides the new entry */
	ipc_port_set_init(pset, SYNC_POLICY_INIT_LOCKED);
	/* port set is locked */

	is_write_unlock(space);

	/* returned locked, without a reference for the caller (see header comment) */
	*namep = name;
	*psetp = pset;
	return KERN_SUCCESS;
}
132 
133 /*
134  *	Routine:	ipc_pset_alloc_name
135  *	Purpose:
136  *		Allocate a port set, with a specific name.
137  *	Conditions:
138  *		Nothing locked.  If successful, the port set is returned
139  *		locked.  (The caller doesn't have a reference.)
140  *	Returns:
141  *		KERN_SUCCESS		The port set is allocated.
142  *		KERN_INVALID_TASK	The space is dead.
143  *		KERN_NAME_EXISTS	The name already denotes a right.
144  */
145 
kern_return_t
ipc_pset_alloc_name(
	ipc_space_t             space,
	mach_port_name_t        name,
	ipc_pset_t              *psetp)
{
	/*
	 * The init block runs from within ipc_object_alloc_name(),
	 * so the pset is initialized (and left locked, per
	 * SYNC_POLICY_INIT_LOCKED) before the call returns.
	 */
	return ipc_object_alloc_name(space, IOT_PORT_SET,
	           MACH_PORT_TYPE_PORT_SET, 0,
	           name, (ipc_object_t *)psetp, ^(ipc_object_t object){
		ipc_port_set_init(ips_object_to_pset(object),
		SYNC_POLICY_INIT_LOCKED);
	});
}
159 
160 
161 /*
162  *	Routine:	ipc_pset_alloc_special
163  *	Purpose:
164  *		Allocate a port set in a special space.
165  *		The new port set is returned with one ref.
166  *		If unsuccessful, IPS_NULL is returned.
167  *	Conditions:
168  *		Nothing locked.
169  */
170 ipc_pset_t
ipc_pset_alloc_special(__assert_only ipc_space_t space)171 ipc_pset_alloc_special(
172 	__assert_only ipc_space_t space)
173 {
174 	ipc_pset_t pset;
175 
176 	assert(space != IS_NULL);
177 	assert(!is_active(space));
178 
179 	pset = ips_object_to_pset(io_alloc(IOT_PORT_SET, Z_WAITOK | Z_ZERO));
180 	if (pset == IPS_NULL) {
181 		return IPS_NULL;
182 	}
183 
184 	os_atomic_init(&pset->ips_object.io_bits, io_makebits(TRUE, IOT_PORT_SET, 0));
185 	os_atomic_init(&pset->ips_object.io_references, 1);
186 
187 	ipc_port_set_init(pset, 0);
188 
189 	return pset;
190 }
191 
192 
193 /*
194  *	Routine:	ipc_pset_add_unlock
195  *	Purpose:
196  *		Puts a port into a port set.
197  *	Conditions:
198  *		Port locked and active.
199  *		Port is unlocked on return.
200  *		The owner of the port set is also receiver for the port.
201  */
202 
203 kern_return_t
ipc_pset_add_unlock(ipc_pset_t pset,ipc_port_t port,waitq_ref_t * reserved_link,uint64_t * reserved_prepost)204 ipc_pset_add_unlock(
205 	ipc_pset_t        pset,
206 	ipc_port_t        port,
207 	waitq_ref_t      *reserved_link,
208 	uint64_t         *reserved_prepost)
209 {
210 	require_ip_active(port);
211 
212 	return ipc_mqueue_add_unlock(&port->ip_messages, pset,
213 	           reserved_link, reserved_prepost, FALSE);
214 }
215 
216 
217 
218 /*
219  *	Routine:	ipc_pset_remove_locked
220  *	Purpose:
221  *		Removes a port from a port set.
222  *		The port set loses a reference.
223  *	Conditions:
224  *		Port is locked and active.
225  */
226 
227 kern_return_t
ipc_pset_remove_locked(ipc_pset_t pset,ipc_port_t port)228 ipc_pset_remove_locked(
229 	ipc_pset_t        pset,
230 	ipc_port_t        port)
231 {
232 	require_ip_active(port);
233 
234 	if (ip_in_pset(port)) {
235 		return waitq_unlink_locked(&port->ip_waitq, &pset->ips_wqset);
236 	}
237 	return KERN_NOT_IN_SET;
238 }
239 
240 /*
241  *	Routine:	ipc_pset_lazy_allocate
242  *	Purpose:
243  *		lazily initialize the wqset of a port set.
244  *	Conditions:
245  *		Nothing locked.
246  */
247 
kern_return_t
ipc_pset_lazy_allocate(
	ipc_space_t space,
	mach_port_name_t psname)
{
	ipc_entry_bits_t bits;
	kern_return_t kr;
	ipc_object_t psobj;

	kr = ipc_right_lookup_read(space, psname, &bits, &psobj);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* object is locked and active */

	/* the name must actually denote a port set */
	if ((bits & MACH_PORT_TYPE_PORT_SET) == 0) {
		io_unlock(psobj);
		return KERN_INVALID_RIGHT;
	}

	/*
	 * Take a reference so the pset survives after we drop its lock;
	 * waitq_set_lazy_init_link() may allocate and must not be called
	 * with the object lock (a spinlock) held.
	 */
	io_reference(psobj);
	io_unlock(psobj);

	/*
	 * lazily initialize the wqset to avoid
	 * possible allocation while linking
	 * under spinlocks.
	 */
	waitq_set_lazy_init_link(&ips_object_to_pset(psobj)->ips_wqset);

	io_release(psobj);

	return KERN_SUCCESS;
}
282 
283 /*
284  *	Routine:	ipc_pset_remove_from_all_unlock
285  *	Purpose:
 *		Removes a port from all its port sets.
287  *	Conditions:
288  *		port is locked and active, port unlocked on return.
289  */
290 
void
ipc_pset_remove_from_all_unlock(
	ipc_port_t      port)
{
	assert(waitq_is_valid(&port->ip_waitq));
	/* unlinks the port from every set it is a member of; drops the port lock */
	waitq_unlink_all_unlock(&port->ip_waitq);
}
298 
299 /*
300  *	Routine:	ipc_pset_move_unlock
301  *	Purpose:
302  *		Removes a port from all its port sets and adds it to given port set.
303  *	Conditions:
304  *		port is locked and active.
305  *		port is unlocked on return.
306  */
307 kern_return_t
ipc_pset_move_unlock(ipc_pset_t pset,ipc_port_t port,uint64_t * reserved_prepost)308 ipc_pset_move_unlock(
309 	ipc_pset_t        pset,
310 	ipc_port_t        port,
311 	uint64_t         *reserved_prepost)
312 {
313 	return ipc_mqueue_add_unlock(&port->ip_messages, pset,
314 	           NULL, reserved_prepost, TRUE);
315 }
316 
317 /*
318  *	Routine:	ipc_pset_destroy
319  *	Purpose:
320  *		Destroys a port_set.
321  *	Conditions:
322  *		The port_set is locked and alive.
323  *		The caller has a reference, which is consumed.
324  *		Afterwards, the port_set is unlocked and dead.
325  */
326 
void
ipc_pset_destroy(
	ipc_space_t     space,
	ipc_pset_t      pset)
{
	assert(ips_active(pset));

	/* mark the pset dead before waking anyone, so waiters see the change */
	io_bits_andnot(ips_to_object(pset), IO_BITS_ACTIVE);

	/*
	 * Set all waiters on the portset running to
	 * discover the change.
	 *
	 * Then under the same lock hold, deinit the waitq-set,
	 * which will remove all the member message queues,
	 * linkages and clean up preposts.
	 */
	ipc_mqueue_changed(space, &pset->ips_wqset.wqset_q);
	waitq_set_deinit_and_unlock(&pset->ips_wqset);
	/* pset is now unlocked and dead */

	ips_release(pset);       /* consume the ref our caller gave us */
}
349 
350 /*
351  * Kqueue EVFILT_MACHPORT support
352  *
353  * - kn_ipc_obj points to the monitored ipc port or pset
354  *
355  * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer
356  *   that can be used to direct-deliver messages when
357  *   MACH_RCV_MSG is set in kn_sfflags
358  *
359  * - (in/out) ext[1] holds a mach_msg_size_t representing the size
360  *   of the userspace buffer held in ext[0].
361  *
362  * - (out)    ext[2] is used to deliver qos information
363  *   about the send queue to userspace.
364  *
365  * - (abused) ext[3] is used in kernel to hold a reference to the first port
366  *   with a turnstile that participate to sync IPC override.
367  *
368  * - kn_hook is optionally a "knote" turnstile. It is used as the inheritor
369  *   of turnstiles for rights copied out as part of direct message delivery
370  *   when they can participate to sync IPC override.
371  *
372  *   It is used to atomically neuter the sync IPC override when the knote is
373  *   re-enabled.
374  *
375  */
376 
377 #include <sys/event.h>
378 #include <sys/errno.h>
379 
/*
 * Compute the FILTER_* result bits for a machport knote, given its
 * (locked) backing object — either a port or a port set.
 *
 * Returns FILTER_ACTIVE when a message is available, optionally
 * combined with QoS adjustment bits when the knote direct-receives
 * (MACH_RCV_MSG).  Returns the reset/zero baseline when the waitq is
 * no longer valid (object dying) or no message is queued.
 */
static int
filt_machport_filter_result(struct knote *kn, ipc_object_t object)
{
	struct waitq *wq = io_waitq(object);
	ipc_kmsg_t first;
	int result = 0;

	io_lock_held(object);

	/* direct-receive knotes start from a clean QoS slate */
	if (kn->kn_sfflags & MACH_RCV_MSG) {
		result = FILTER_RESET_EVENT_QOS;
	}

	if (!waitq_is_valid(wq)) {
		return result;
	}

	if (waitq_is_set(wq)) {
		ipc_pset_t pset = ips_object_to_pset(object);
		int rc;

		/*
		 * A port set is active iff at least one preposting member
		 * port has a message queued; stop at the first such port.
		 */
		rc = waitq_set_iterate_preposts(&pset->ips_wqset,
		    ^(struct waitq *waitq) {
			ipc_port_t port = ip_from_waitq(waitq);
			ipc_kmsg_queue_t kmsgs = &port->ip_messages.imq_messages;

			if (ipc_kmsg_queue_first(kmsgs) != IKM_NULL) {
			        /* break out of the prepost iteration */
			        return WQ_ITERATE_BREAK;
			}
			return WQ_ITERATE_CONTINUE;
		});
		if (rc == WQ_ITERATE_BREAK) {
			result = FILTER_ACTIVE;
		}
		return result;
	}

	/* plain receive right: look at the head of its message queue */
	ipc_port_t port = ip_object_to_port(object);

	first = ipc_kmsg_queue_first(&port->ip_messages.imq_messages);
	if (!first) {
		return result;
	}

	result = FILTER_ACTIVE;
	if (kn->kn_sfflags & MACH_RCV_MSG) {
		/* propagate the first message's override QoS to the event */
		result |= FILTER_ADJUST_EVENT_QOS(first->ikm_qos_override);
	}

#if CONFIG_PREADOPT_TG
	/* pre-adopt the sender's thread group on the servicing kqueue */
	struct thread_group *tg = ipc_kmsg_get_thread_group(first);
	if (tg) {
		struct kqueue *kq = knote_get_kq(kn);
		kqueue_set_preadopted_thread_group(kq, tg,
		    first->ikm_qos_override);
	}
#endif

	return result;
}
441 
442 struct turnstile *
filt_ipc_kqueue_turnstile(struct knote * kn)443 filt_ipc_kqueue_turnstile(struct knote *kn)
444 {
445 	assert(kn->kn_filter == EVFILT_MACHPORT || kn->kn_filter == EVFILT_WORKLOOP);
446 	return kqueue_turnstile(knote_get_kq(kn));
447 }
448 
449 bool
filt_machport_kqueue_has_turnstile(struct knote * kn)450 filt_machport_kqueue_has_turnstile(struct knote *kn)
451 {
452 	assert(kn->kn_filter == EVFILT_MACHPORT);
453 	return ((kn->kn_sfflags & MACH_RCV_MSG) || (kn->kn_sfflags & MACH_RCV_SYNC_PEEK))
454 	       && (kn->kn_flags & EV_DISPATCH);
455 }
456 
457 /*
458  * Stashes a port that participate to sync IPC override until the knote
459  * is being re-enabled.
460  *
461  * It returns:
462  * - the turnstile to use as an inheritor for the stashed port
463  * - the kind of stash that happened as PORT_SYNC_* value among:
464  *   o not stashed (no sync IPC support)
465  *   o stashed in the knote (in kn_ext[3])
466  *   o to be hooked to the kn_hook knote
467  */
struct turnstile *
filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
{
	struct turnstile *ts = TURNSTILE_NULL;

	if (kn->kn_filter == EVFILT_WORKLOOP) {
		/* workloop knotes stash the port in kn_ipc_obj (with a ref) */
		assert(kn->kn_ipc_obj == NULL);
		kn->kn_ipc_obj = ip_to_object(port);
		ip_reference(port);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
		}
		ts = filt_ipc_kqueue_turnstile(kn);
	} else if (!filt_machport_kqueue_has_turnstile(kn)) {
		/* no sync IPC override support: nothing to stash, no turnstile */
		if (link) {
			*link = PORT_SYNC_LINK_NO_LINKAGE;
		}
	} else if (kn->kn_ext[3] == 0) {
		/* first port stashed on this knote goes into kn_ext[3] (with a ref) */
		ip_reference(port);
		kn->kn_ext[3] = (uintptr_t)port;
		ts = filt_ipc_kqueue_turnstile(kn);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
		}
	} else {
		/*
		 * kn_ext[3] is already taken: use the knote turnstile prepared
		 * by filt_machport_turnstile_prepare_lazily() as the inheritor.
		 */
		ts = (struct turnstile *)kn->kn_hook;
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_STASH;
		}
	}

	return ts;
}
501 
502 /*
503  * Lazily prepare a turnstile so that filt_machport_stash_port()
504  * can be called with the mqueue lock held.
505  *
506  * It will allocate a turnstile in kn_hook if:
507  * - the knote supports sync IPC override,
508  * - we already stashed a port in kn_ext[3],
509  * - the object that will be copied out has a chance to ask to be stashed.
510  *
511  * It is setup so that its inheritor is the workloop turnstile that has been
512  * allocated when this knote was attached.
513  */
void
filt_machport_turnstile_prepare_lazily(
	struct knote *kn,
	mach_msg_type_name_t msgt_name,
	ipc_port_t port)
{
	/* This is called from within filt_machportprocess */
	assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED));

	/* knote doesn't support sync IPC override: nothing to prepare */
	if (!filt_machport_kqueue_has_turnstile(kn)) {
		return;
	}

	/* no port stashed yet, or turnstile already allocated: nothing to do */
	if (kn->kn_ext[3] == 0 || kn->kn_hook) {
		return;
	}

	struct turnstile *ts = filt_ipc_kqueue_turnstile(kn);
	/*
	 * Only rights that can ask to be stashed need the knote turnstile:
	 * special reply send-once rights and receive rights.
	 */
	if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) ||
	    (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) {
		struct turnstile *kn_ts = turnstile_alloc();
		kn_ts = turnstile_prepare((uintptr_t)kn,
		    (struct turnstile **)&kn->kn_hook, kn_ts, TURNSTILE_KNOTE);
		/* push from the knote turnstile onto the workloop turnstile */
		turnstile_update_inheritor(kn_ts, ts,
		    TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
		turnstile_cleanup();
	}
}
542 
/*
 * Sever the sync IPC linkage between a knote and a port it stashed,
 * and consume the reference the stash was holding on the port.
 */
static void
filt_machport_turnstile_complete_port(struct knote *kn, ipc_port_t port)
{
	struct turnstile *ts = TURNSTILE_NULL;

	ip_mq_lock(port);
	if (port->ip_specialreply) {
		/*
		 * If the reply has been sent to the special reply port already,
		 * then the special reply port may already be reused to do something
		 * entirely different.
		 *
		 * However, the only reason for it to still point to this knote is
		 * that it's still waiting for a reply, so when this is the case,
		 * neuter the linkage.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_sync_inheritor_knote == kn) {
			ipc_port_adjust_special_reply_port_locked(port, NULL,
			    (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
			/* port unlocked */
		} else {
			ip_mq_unlock(port);
		}
	} else {
		/*
		 * For receive rights, if their IMQ_KNOTE() is still this
		 * knote, then sever the link.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_messages.imq_inheritor_knote == kn) {
			ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
			ts = port_send_turnstile(port);
		}
		/* drop the send turnstile's inheritor while the port is locked */
		if (ts) {
			turnstile_reference(ts);
			turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
			    TURNSTILE_IMMEDIATE_UPDATE);
		}
		ip_mq_unlock(port);

		/* finish the turnstile update outside the port lock */
		if (ts) {
			turnstile_update_inheritor_complete(ts,
			    TURNSTILE_INTERLOCK_NOT_HELD);
			turnstile_deallocate(ts);
		}
	}

	/* consume the reference the stash held on the port */
	ip_release(port);
}
593 
594 void
filt_wldetach_sync_ipc(struct knote * kn)595 filt_wldetach_sync_ipc(struct knote *kn)
596 {
597 	ipc_object_t io = kn->kn_ipc_obj;
598 	filt_machport_turnstile_complete_port(kn, ip_object_to_port(io));
599 	kn->kn_ipc_obj = IO_NULL;
600 }
601 
602 /*
603  * Other half of filt_machport_turnstile_prepare_lazily()
604  *
605  * This is serialized by the knote state machine.
606  */
static void
filt_machport_turnstile_complete(struct knote *kn)
{
	/* release the port stashed in kn_ext[3], if any */
	if (kn->kn_ext[3]) {
		ipc_port_t port = (ipc_port_t)kn->kn_ext[3];
		filt_machport_turnstile_complete_port(kn, port);
		kn->kn_ext[3] = 0;
	}

	/* tear down the knote turnstile allocated by the prepare_lazily path */
	if (kn->kn_hook) {
		struct turnstile *ts = kn->kn_hook;

		turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
		    TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);

		turnstile_complete((uintptr_t)kn, (struct turnstile **)&kn->kn_hook, &ts, TURNSTILE_KNOTE);
		turnstile_cleanup();

		assert(ts);
		turnstile_deallocate(ts);
	}
}
630 
631 static void
filt_machport_link(struct klist * klist,struct knote * kn)632 filt_machport_link(struct klist *klist, struct knote *kn)
633 {
634 	struct knote *hd = SLIST_FIRST(klist);
635 
636 	if (hd && filt_machport_kqueue_has_turnstile(kn)) {
637 		SLIST_INSERT_AFTER(hd, kn, kn_selnext);
638 	} else {
639 		SLIST_INSERT_HEAD(klist, kn, kn_selnext);
640 	}
641 }
642 
/*
 * Remove a knote from a port/pset klist, then restore the invariant
 * that the head of the list (if any) is a knote we can push on.
 */
static void
filt_machport_unlink(struct klist *klist, struct knote *kn)
{
	struct knote **knprev;

	KNOTE_DETACH(klist, kn);

	/* make sure the first knote is a knote we can push on */
	/* NB: `kn` is reused as the iteration cursor from here on */
	SLIST_FOREACH_PREVPTR(kn, knprev, klist, kn_selnext) {
		if (filt_machport_kqueue_has_turnstile(kn)) {
			/* unlink it from its current spot and move it to the head */
			*knprev = SLIST_NEXT(kn, kn_selnext);
			SLIST_INSERT_HEAD(klist, kn, kn_selnext);
			break;
		}
	}
}
659 
/*
 * Workloop attach path for sync IPC: look up the right named by the
 * knote's id and link it to the knote so it participates in the
 * workloop's sync IPC override.
 *
 * Returns 0 on success, ENOENT when the name doesn't denote a
 * linkable right (receive right on a non-special-reply port, or
 * send-once right on a special reply port).
 */
int
filt_wlattach_sync_ipc(struct knote *kn)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	ipc_port_t port = IP_NULL;
	int error = 0;

	if (ipc_right_lookup_read(space, name, &bits, &object) != KERN_SUCCESS) {
		return ENOENT;
	}
	/* object is locked and active */

	/* only receive rights (regular ports) and send-once rights
	 * (special reply ports) can be linked */
	if (bits & MACH_PORT_TYPE_RECEIVE) {
		port = ip_object_to_port(object);
		if (port->ip_specialreply) {
			error = ENOENT;
		}
	} else if (bits & MACH_PORT_TYPE_SEND_ONCE) {
		port = ip_object_to_port(object);
		if (!port->ip_specialreply) {
			error = ENOENT;
		}
	} else {
		error = ENOENT;
	}
	if (error) {
		io_unlock(object);
		return error;
	}

	if (port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
		io_unlock(object);
		/*
		 * We cannot start a sync IPC inheritance chain, only further one
		 * Note: this can also happen if the inheritance chain broke
		 * because the original requestor died.
		 */
		return ENOENT;
	}

	/* both adjust calls unlock the port and stash it on the knote */
	if (port->ip_specialreply) {
		ipc_port_adjust_special_reply_port_locked(port, kn,
		    IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE);
	} else {
		ipc_port_adjust_port_locked(port, kn, FALSE);
	}

	/* make sure the port was stashed */
	assert(kn->kn_ipc_obj == ip_to_object(port));

	/* port has been unlocked by ipc_port_adjust_* */

	return 0;
}
717 
/*
 * EVFILT_MACHPORT attach: resolve the knote's id to a port set or a
 * receive right, take a reference on it, hook the knote into the
 * object's klist and compute the initial filter result.
 *
 * Errors: ENOENT when the name doesn't resolve, ENOTSUP for special
 * reply ports and non-receive, non-pset rights.
 */
static int
filt_machportattach(
	struct knote *kn,
	__unused struct kevent_qos_s *kev)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	struct turnstile *send_turnstile = TURNSTILE_NULL;

	int error = 0;
	int result = 0;
	kern_return_t kr;

	kn->kn_flags &= ~EV_EOF;
	kn->kn_ext[3] = 0;

	if (filt_machport_kqueue_has_turnstile(kn)) {
		/*
		 * If the filter is likely to support sync IPC override,
		 * and it happens to be attaching to a workloop,
		 * make sure the workloop has an allocated turnstile.
		 */
		kqueue_alloc_turnstile(knote_get_kq(kn));
	}

	kr = ipc_right_lookup_read(space, name, &bits, &object);

	if (kr != KERN_SUCCESS) {
		error = ENOENT;
		goto out;
	}
	/* object is locked and active */

	if (bits & MACH_PORT_TYPE_PORT_SET) {
		ipc_pset_t pset = ips_object_to_pset(object);

		/* keep a ref on the pset for the lifetime of the knote */
		io_reference(object);
		kn->kn_ipc_obj = object;
		filt_machport_link(&pset->ips_klist, kn);
		result = filt_machport_filter_result(kn, object);
		io_unlock(object);
	} else if (bits & MACH_PORT_TYPE_RECEIVE) {
		ipc_port_t port = ip_object_to_port(object);

		if (port->ip_specialreply) {
			/*
			 * Registering for kevents on special reply ports
			 * isn't supported for two reasons:
			 *
			 * 1. it really makes very little sense for a port that
			 *    is supposed to be used synchronously
			 *
			 * 2. their ports's ip_klist field will be used to
			 *    store the receive turnstile, so we can't possibly
			 *    attach them anyway.
			 */
			io_unlock(object);
			error = ENOTSUP;
			goto out;
		}

		/* keep a ref on the port for the lifetime of the knote */
		io_reference(object);
		kn->kn_ipc_obj = object;
		if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) {
			/*
			 * We're attaching a port that used to have an IMQ_KNOTE,
			 * clobber this state, we'll fixup its turnstile inheritor below.
			 */
			ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
		}

		filt_machport_link(&port->ip_klist, kn);
		result = filt_machport_filter_result(kn, object);

		/*
		 * Update the port's turnstile inheritor
		 *
		 * Unlike filt_machportdetach(), we don't have to care about races for
		 * turnstile_workloop_pusher_info(): filt_machport_link() doesn't affect
		 * already pushing knotes, and if the current one becomes the new
		 * pusher, it'll only be visible when turnstile_workloop_pusher_info()
		 * returns.
		 */
		send_turnstile = port_send_turnstile(port);
		if (send_turnstile) {
			turnstile_reference(send_turnstile);
			ipc_port_send_update_inheritor(port, send_turnstile,
			    TURNSTILE_IMMEDIATE_UPDATE);

			/*
			 * rdar://problem/48861190
			 *
			 * When a listener connection resumes a peer,
			 * updating the inheritor above has moved the push
			 * from the current thread to the workloop.
			 *
			 * However, we haven't told the workloop yet
			 * that it needs a thread request, and we risk
			 * to be preempted as soon as we drop the space
			 * lock below.
			 *
			 * To avoid this disable preemption and let kevent
			 * reenable it after it takes the kqlock.
			 */
			disable_preemption();
			result |= FILTER_THREADREQ_NODEFEER;
		}

		io_unlock(object);

		/* finish the turnstile update outside the object lock */
		if (send_turnstile) {
			turnstile_update_inheritor_complete(send_turnstile,
			    TURNSTILE_INTERLOCK_NOT_HELD);
			turnstile_deallocate_safe(send_turnstile);
		}
	} else {
		io_unlock(object);
		error = ENOTSUP;
	}

out:
	/* bail out on errors */
	if (error) {
		knote_set_error(kn, error);
		return 0;
	}

	return result;
}
849 
/*
 * EVFILT_MACHPORT detach: undo filt_machportattach() — tear down the
 * knote's turnstile state, unhook it from the object's klist (unless
 * ipc_mqueue_changed() already did), fix up the send turnstile's
 * inheritor if this knote was the pusher, and drop the object ref.
 */
static void
filt_machportdetach(
	struct knote *kn)
{
	ipc_object_t object = kn->kn_ipc_obj;
	struct turnstile *send_turnstile = TURNSTILE_NULL;

	filt_machport_turnstile_complete(kn);

	io_lock(object);
	if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
		/*
		 * ipc_mqueue_changed() already unhooked this knote from the waitq,
		 */
	} else {
		ipc_port_t port = IP_NULL;

		/*
		 * When the knote being detached is the first one in the list,
		 * then unlinking the knote *and* updating the turnstile inheritor
		 * need to happen atomically with respect to the callers of
		 * turnstile_workloop_pusher_info().
		 *
		 * The caller of turnstile_workloop_pusher_info() will use the kq req
		 * lock (and hence the kqlock), so we just need to hold the kqlock too.
		 */
		if (io_otype(object) == IOT_PORT) {
			port = ip_object_to_port(object);
			assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY);
			if (kn == SLIST_FIRST(&port->ip_klist)) {
				send_turnstile = port_send_turnstile(port);
			}
			filt_machport_unlink(&port->ip_klist, kn);
		} else {
			ipc_pset_t pset = ips_object_to_pset(object);

			filt_machport_unlink(&pset->ips_klist, kn);
		}


		/* re-derive the inheritor now that this knote is off the list */
		if (send_turnstile) {
			turnstile_reference(send_turnstile);
			ipc_port_send_update_inheritor(port, send_turnstile,
			    TURNSTILE_IMMEDIATE_UPDATE);
		}
	}

	/* Clear the knote pointer once the knote has been removed from turnstile */
	kn->kn_ipc_obj = IO_NULL;
	io_unlock(object);

	if (send_turnstile) {
		turnstile_update_inheritor_complete(send_turnstile,
		    TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate(send_turnstile);
	}

	/* drop the reference taken at attach time */
	io_release(object);
}
909 
910 /*
911  * filt_machportevent - deliver events into the mach port filter
912  *
913  * Mach port message arrival events are currently only posted via the
914  * kqueue filter routine for ports.
915  *
916  * If there is a message at the head of the queue,
917  * we indicate that the knote should go active.  If
918  * the message is to be direct-received, we adjust the
919  * QoS of the knote according the requested and override
920  * QoS of that first message.
921  *
922  * When the knote is for a port-set, the hint is non 0
923  * and is the waitq which is posting.
924  */
925 static int
filt_machportevent(struct knote * kn,long hint __assert_only)926 filt_machportevent(struct knote *kn, long hint __assert_only)
927 {
928 	if (io_otype(kn->kn_ipc_obj) == IOT_PORT_SET) {
929 		/*
930 		 * When called for a port-set,
931 		 * the posting port waitq is locked.
932 		 *
933 		 * waitq_set_iterate_preposts()
934 		 * in filt_machport_filter_result()
935 		 * would try to lock it and be very sad.
936 		 *
937 		 * Just trust what we know to be true.
938 		 */
939 		assert(hint != 0);
940 		return FILTER_ACTIVE;
941 	}
942 	assert(hint == 0);
943 	return filt_machport_filter_result(kn, kn->kn_ipc_obj);
944 }
945 
946 /*
947  * Upcall from the waitq code to prepost to the kevent subsystem.
948  *
949  * Called with the pset and waitq locks held.
950  */
951 void
ipc_pset_prepost(struct waitq_set * wqs,struct waitq * waitq)952 ipc_pset_prepost(struct waitq_set *wqs, struct waitq *waitq)
953 {
954 	KNOTE(&ips_from_waitq(&wqs->wqset_q)->ips_klist, (long)waitq);
955 }
956 
957 static int
filt_machporttouch(struct knote * kn,struct kevent_qos_s * kev)958 filt_machporttouch(
959 	struct knote *kn,
960 	struct kevent_qos_s *kev)
961 {
962 	ipc_object_t object = kn->kn_ipc_obj;
963 	int result = 0;
964 
965 	/* copy in new settings and save off new input fflags */
966 	kn->kn_sfflags = kev->fflags;
967 	kn->kn_ext[0] = kev->ext[0];
968 	kn->kn_ext[1] = kev->ext[1];
969 
970 	if (kev->flags & EV_ENABLE) {
971 		/*
972 		 * If the knote is being enabled, make sure there's no lingering
973 		 * IPC overrides from the previous message delivery.
974 		 */
975 		filt_machport_turnstile_complete(kn);
976 	}
977 
978 	io_lock(object);
979 	result = filt_machport_filter_result(kn, object);
980 	io_unlock(object);
981 
982 	return result;
983 }
984 
static int
filt_machportprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	/*
	 * f_process callback for EVFILT_MACHPORT: report the event to
	 * userspace and, if MACH_RCV_MSG was requested at registration,
	 * attempt a non-blocking direct receive of the pending message,
	 * returning the receive status in kev->fflags.
	 */
	ipc_object_t object = kn->kn_ipc_obj;
	thread_t self = current_thread();
	kevent_ctx_t kectx = NULL;

	wait_result_t wresult;
	mach_msg_option_t option;
	mach_vm_address_t addr;
	mach_msg_size_t size;

	/* Capture current state */
	knote_fill_kevent(kn, kev, MACH_PORT_NULL);
	kev->ext[3] = 0; /* hide our port reference from userspace */

	/* If already deallocated/moved return one last EOF event */
	if (kev->flags & EV_EOF) {
		return FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
	}

	/*
	 * Only honor supported receive options. If no options are
	 * provided, just force a MACH_RCV_TOO_LARGE to detect the
	 * name of the port and sizeof the waiting message.
	 */
	option = kn->kn_sfflags & (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
	    MACH_RCV_TRAILER_MASK | MACH_RCV_VOUCHER | MACH_MSG_STRICT_REPLY);

	if (option & MACH_RCV_MSG) {
		/* buffer address/size were stashed in kn_ext[] at attach/touch time */
		addr = (mach_vm_address_t) kn->kn_ext[0];
		size = (mach_msg_size_t) kn->kn_ext[1];

		/*
		 * If the kevent didn't specify a buffer and length, carve a buffer
		 * from the filter processing data according to the flags.
		 */
		if (size == 0) {
			kectx = kevent_get_context(self);
			addr  = (mach_vm_address_t)kectx->kec_data_out;
			size  = (mach_msg_size_t)kectx->kec_data_resid;
			/* force large-message reporting so truncation is detectable */
			option |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY);
			if (kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) {
				/* stack-style buffers fill from the top down */
				option |= MACH_RCV_STACK;
			}
		}
	} else {
		/* just detect the port name (if a set) and size of the first message */
		option = MACH_RCV_LARGE;
		addr = 0;
		size = 0;
	}

	/*
	 * Set up to receive a message or the notification of a
	 * too large message.  But never allow this call to wait.
	 * If the user provided additional options, like trailer
	 * options, pass those through here.  But we don't support
	 * scatter lists through this interface.
	 *
	 * Note: while in filt_machportprocess(),
	 *       the knote has a reference on `object` that we can borrow.
	 */
	self->ith_object = object;
	self->ith_msg_addr = addr;
	self->ith_rsize = size;
	self->ith_msize = 0;
	self->ith_option = option;
	self->ith_receiver_name = MACH_PORT_NULL;
	self->ith_continuation = NULL;
	option |= MACH_RCV_TIMEOUT; // never wait
	self->ith_state = MACH_RCV_IN_PROGRESS;
	self->ith_knote = kn;

	wresult = ipc_mqueue_receive_on_thread(
		io_waitq(object),
		option,
		size,         /* max_size */
		0,         /* immediate timeout */
		THREAD_INTERRUPTIBLE,
		self);
	/* port unlocked */

	/* If we timed out, or the process is exiting, just zero.  */
	if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) {
		assert(self->turnstile != TURNSTILE_NULL);
		return 0;
	}

	/* anything else means the receive attempt resolved synchronously */
	assert(wresult == THREAD_NOT_WAITING);
	assert(self->ith_state != MACH_RCV_IN_PROGRESS);

	/*
	 * If we weren't attempting to receive a message
	 * directly, we need to return the port name in
	 * the kevent structure.
	 */
	if ((option & MACH_RCV_MSG) != MACH_RCV_MSG) {
		assert(self->ith_state == MACH_RCV_TOO_LARGE);
		assert(self->ith_kmsg == IKM_NULL);
		kev->data = self->ith_receiver_name;
		return FILTER_ACTIVE;
	}

#if CONFIG_PREADOPT_TG
	/* If we're the first EVFILT_MACHPORT knote that is being processed for this
	 * kqwl, then make sure to preadopt the thread group from the kmsg we're
	 * about to receive. This is to make sure that we fix up the preadoption
	 * thread group correctly on the receive side for the first message.
	 */
	struct kqueue *kq = knote_get_kq(kn);

	if (self->ith_kmsg) {
		struct thread_group *tg = ipc_kmsg_get_thread_group(self->ith_kmsg);

		kqueue_process_preadopt_thread_group(self, kq, tg);
	}
#endif

	/*
	 * Attempt to receive the message directly, returning
	 * the results in the fflags field.
	 */
	io_reference(object);
	kev->fflags = mach_msg_receive_results(&size);

	/* kmsg and object reference consumed */

	/*
	 * if the user asked for the identity of ports containing a
	 * too-large message, return it in the data field (as we
	 * do for messages we didn't try to receive).
	 */
	if (kev->fflags == MACH_RCV_TOO_LARGE) {
		kev->ext[1] = self->ith_msize; /* actual size of the pending message */
		if (option & MACH_RCV_LARGE_IDENTITY) {
			kev->data = self->ith_receiver_name;
		} else {
			kev->data = MACH_PORT_NULL;
		}
	} else {
		kev->ext[1] = size; /* size actually received */
		kev->data = MACH_PORT_NULL;
	}

	/*
	 * If we used a data buffer carved out from the filt_process data,
	 * store the address used in the knote and adjust the residual and
	 * other parameters for future use.
	 */
	if (kectx) {
		assert(kectx->kec_data_resid >= size);
		kectx->kec_data_resid -= size;
		if ((kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) == 0) {
			/* heap-style: consume from the bottom, advance the cursor */
			kev->ext[0] = kectx->kec_data_out;
			kectx->kec_data_out += size;
		} else {
			/* stack-style: message landed at the top of the remaining space */
			assert(option & MACH_RCV_STACK);
			kev->ext[0] = kectx->kec_data_out + kectx->kec_data_resid;
		}
	}

	/*
	 * Apply message-based QoS values to output kevent as prescribed.
	 * The kev->ext[2] field gets (msg-qos << 32) | (override-qos).
	 *
	 * The mach_msg_receive_results() call saved off the message
	 * QoS values in the continuation save area on successful receive.
	 */
	if (kev->fflags == MACH_MSG_SUCCESS) {
		kev->ext[2] = ((uint64_t)self->ith_ppriority << 32) |
		    _pthread_priority_make_from_thread_qos(self->ith_qos_override, 0, 0);
	}

	return FILTER_ACTIVE;
}
1161 
1162 SECURITY_READ_ONLY_EARLY(struct filterops) machport_filtops = {
1163 	.f_adjusts_qos = true,
1164 	.f_extended_codes = true,
1165 	.f_attach = filt_machportattach,
1166 	.f_detach = filt_machportdetach,
1167 	.f_event = filt_machportevent,
1168 	.f_touch = filt_machporttouch,
1169 	.f_process = filt_machportprocess,
1170 };
1171