xref: /xnu-10002.41.9/osfmk/ipc/ipc_pset.c (revision 699cd48037512bf4380799317ca44ca453c82f57)
/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	ipc/ipc_pset.c
 *	Author:	Rich Draves
 *	Date:	1989
 *
 *	Functions to manipulate IPC port sets.
 */

#include <mach/port.h>
#include <mach/kern_return.h>
#include <mach/message.h>
#include <ipc/ipc_mqueue.h>
#include <ipc/ipc_object.h>
#include <ipc/ipc_pset.h>
#include <ipc/ipc_right.h>
#include <ipc/ipc_space.h>
#include <ipc/ipc_port.h>
#include <ipc/ipc_kmsg.h>
#include <kern/policy_internal.h>

#include <kern/kern_types.h>

#include <vm/vm_map.h>
#include <libkern/section_keywords.h>
#include <pthread/priority_private.h>

/* processor_set stole ipc_pset_init */
static void
ipc_port_set_init(ipc_pset_t pset, mach_port_name_t name, int policy)
{
	waitq_init(&pset->ips_wqset, WQT_PORT_SET, policy | SYNC_POLICY_FIFO);
	klist_init(&pset->ips_klist);
	pset->ips_wqset.wqset_index = MACH_PORT_INDEX(name);
}

/*
 *	Routine:	ipc_pset_alloc
 *	Purpose:
 *		Allocate a port set.
 *	Conditions:
 *		Nothing locked.  If successful, the port set is returned
 *		locked.  (The caller doesn't have a reference.)
 *	Returns:
 *		KERN_SUCCESS		The port set is allocated.
 *		KERN_INVALID_TASK	The space is dead.
 *		KERN_NO_SPACE		No room for an entry in the space.
 */

kern_return_t
ipc_pset_alloc(
	ipc_space_t             space,
	mach_port_name_t        *namep,
	ipc_pset_t              *psetp)
{
	ipc_pset_t pset;
	mach_port_name_t name;
	kern_return_t kr;

	kr = ipc_object_alloc(space, IOT_PORT_SET,
	    MACH_PORT_TYPE_PORT_SET, 0,
	    &name, (ipc_object_t *) &pset);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* space is locked */

	ipc_port_set_init(pset, name, SYNC_POLICY_INIT_LOCKED);
	/* port set is locked */

	is_write_unlock(space);

	*namep = name;
	*psetp = pset;
	return KERN_SUCCESS;
}
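
/*
 * Illustrative sketch (editorial, not part of the original file): how a
 * caller might consume ipc_pset_alloc() given the locking contract above.
 * The helper name is hypothetical; ips_mq_unlock() is the unlock primitive
 * used elsewhere in this file.
 *
 *	static kern_return_t
 *	example_make_pset(ipc_space_t space, mach_port_name_t *namep)
 *	{
 *		ipc_pset_t pset;
 *		kern_return_t kr;
 *
 *		kr = ipc_pset_alloc(space, namep, &pset);
 *		if (kr != KERN_SUCCESS) {
 *			return kr;
 *		}
 *		// pset is returned locked and already entered in the space;
 *		// finish any setup that needs the lock, then drop it.
 *		ips_mq_unlock(pset);
 *		return KERN_SUCCESS;
 *	}
 */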

/*
 *	Routine:	ipc_pset_alloc_name
 *	Purpose:
 *		Allocate a port set, with a specific name.
 *	Conditions:
 *		Nothing locked.  If successful, the port set is returned
 *		locked.  (The caller doesn't have a reference.)
 *	Returns:
 *		KERN_SUCCESS		The port set is allocated.
 *		KERN_INVALID_TASK	The space is dead.
 *		KERN_NAME_EXISTS	The name already denotes a right.
 */

kern_return_t
ipc_pset_alloc_name(
	ipc_space_t             space,
	mach_port_name_t        name,
	ipc_pset_t              *psetp)
{
	return ipc_object_alloc_name(space, IOT_PORT_SET,
	           MACH_PORT_TYPE_PORT_SET, 0,
	           name, (ipc_object_t *)psetp, ^(ipc_object_t object){
		ipc_port_set_init(ips_object_to_pset(object), name,
		SYNC_POLICY_INIT_LOCKED);
	});
}


/*
 *	Routine:	ipc_pset_alloc_special
 *	Purpose:
 *		Allocate a port set in a special space.
 *		The new port set is returned with one ref.
 *		If unsuccessful, IPS_NULL is returned.
 *	Conditions:
 *		Nothing locked.
 */
ipc_pset_t
ipc_pset_alloc_special(
	__assert_only ipc_space_t space)
{
	ipc_pset_t pset;

	assert(space != IS_NULL);
	assert(!is_active(space));

	pset = ips_object_to_pset(io_alloc(IOT_PORT_SET, Z_WAITOK | Z_ZERO));
	if (pset == IPS_NULL) {
		return IPS_NULL;
	}

	os_atomic_init(&pset->ips_object.io_bits, io_makebits(IOT_PORT_SET));
	os_atomic_init(&pset->ips_object.io_references, 1);

	ipc_port_set_init(pset, MACH_PORT_SPECIAL_DEFAULT, 0);

	return pset;
}


/*
 *	Routine:	ipc_pset_destroy
 *	Purpose:
 *		Destroys a port_set.
 *	Conditions:
 *		The port_set is locked and alive.
 *		The caller has a reference, which is consumed.
 *		Afterwards, the port_set is unlocked and dead.
 */

void
ipc_pset_destroy(
	ipc_space_t     space,
	ipc_pset_t      pset)
{
	waitq_link_list_t free_l = { };

	assert(ips_active(pset));

	io_bits_andnot(ips_to_object(pset), IO_BITS_ACTIVE);

	/*
	 * Set all waiters on the portset running to
	 * discover the change.
	 *
	 * Then under the same lock hold, deinit the waitq-set,
	 * which will remove all the member message queues,
	 * linkages and clean up preposts.
	 */
	ipc_mqueue_changed(space, &pset->ips_wqset);
	waitq_invalidate(&pset->ips_wqset);
	waitq_set_unlink_all_locked(&pset->ips_wqset, &free_l);

	ips_mq_unlock(pset);

	ips_release(pset);       /* consume the ref our caller gave us */

	waitq_link_free_list(WQT_PORT_SET, &free_l);
}
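
/*
 * Illustrative sketch (editorial, not part of the original file): what the
 * contract above looks like from a caller's perspective. ips_mq_lock() is
 * assumed to be the pset lock primitive matching the ips_mq_unlock() used
 * in this file; the surrounding code is hypothetical.
 *
 *	// caller holds a reference on pset
 *	ips_mq_lock(pset);
 *	if (ips_active(pset)) {
 *		ipc_pset_destroy(space, pset);	// unlocks pset, consumes our ref
 *	} else {
 *		ips_mq_unlock(pset);
 *		ips_release(pset);
 *	}
 */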

/*
 *	Routine:	ipc_pset_finalize
 *	Purpose:
 *		Called when the last reference is released to
 *		free any remaining data associated with the pset.
 *	Conditions:
 *		Nothing locked.
 */
void
ipc_pset_finalize(
	ipc_pset_t              pset)
{
	waitq_deinit(&pset->ips_wqset);
}


/*
 * Kqueue EVFILT_MACHPORT support
 *
 * - kn_ipc_obj points to the monitored ipc port or pset. If the knote is
 *   using a kqwl, it is eligible to participate in sync IPC overrides.
 *
 *   For the first such sync IPC message in the port, we set up the port's
 *   turnstile to directly push on the kqwl's turnstile (which is in turn set up
 *   during filt_machportattach). If userspace responds to the message, the
 *   turnstile push is severed at the point of reply. If userspace returns without
 *   responding to the message, we sever the turnstile push at the
 *   point of reenabling the knote to deliver the next message. This is why the
 *   knote needs to remember the port. For more details, see also
 *   filt_machport_turnstile_complete.
 *
 *   If there are multiple other sync IPC messages in the port, messages 2 to n
 *   redirect their turnstile push to the kqwl through an intermediary "knote"
 *   turnstile which, in turn, pushes on the kqwl turnstile. This knote turnstile
 *   is stored in the kn_hook. See also filt_machport_turnstile_prepare_lazily.
 *
 * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer
 *   that can be used to direct-deliver messages when
 *   MACH_RCV_MSG is set in kn_sfflags
 *
 * - (in/out) ext[1] holds a mach_msg_size_t representing the size
 *   of the userspace buffer held in ext[0].
 *
 * - (out)    ext[2] is used to deliver qos information
 *   about the send queue to userspace.
 *
 * - (abused) ext[3] is used in kernel to hold a reference to the first port
 *   with a turnstile that participates in sync IPC override. For more details,
 *   see filt_machport_stash_port
 *
 * - kn_hook is optionally a "knote" turnstile. It is used as the inheritor
 *   of turnstiles for rights copied out as part of direct message delivery
 *   when they can participate in sync IPC override.
 *
 *   It is used to atomically neuter the sync IPC override when the knote is
 *   re-enabled.
 *
 */
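
/*
 * Illustrative userspace sketch (editorial, not part of the original file):
 * registering an EVFILT_MACHPORT knote that direct-delivers messages into a
 * caller-supplied buffer via ext[0]/ext[1], as described above. It assumes
 * the private kevent_qos() interface and struct kevent_qos_s declaration
 * from <sys/event.h>; the function and variable names are hypothetical.
 *
 *	#include <sys/event.h>
 *	#include <mach/mach.h>
 *
 *	static int
 *	example_watch_port(int kq, mach_port_name_t rcv_name,
 *	    void *buf, size_t buf_size)
 *	{
 *		struct kevent_qos_s kev = {
 *			.ident  = rcv_name,
 *			.filter = EVFILT_MACHPORT,
 *			.flags  = EV_ADD | EV_ENABLE | EV_DISPATCH,
 *			.fflags = MACH_RCV_MSG | MACH_RCV_VOUCHER,
 *			.ext    = {
 *				[0] = (uint64_t)(uintptr_t)buf,	// receive buffer
 *				[1] = buf_size,			// buffer size
 *			},
 *		};
 *
 *		// When the knote fires, the message is copied out to buf, its
 *		// size is reported in ext[1], any aux data size in ext[3],
 *		// and QoS information in ext[2].
 *		return kevent_qos(kq, &kev, 1, NULL, 0, NULL, NULL, 0);
 *	}
 */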

#include <sys/event.h>
#include <sys/errno.h>

static int
filt_machport_filter_result(struct knote *kn, ipc_object_t object)
{
	struct waitq *wq = io_waitq(object);
	ipc_kmsg_t first;
	int result = 0;

	io_lock_held(object);

	if (kn->kn_sfflags & MACH_RCV_MSG) {
		result = FILTER_RESET_EVENT_QOS;
	}

	if (!waitq_is_valid(wq)) {
		return result;
	}

	if (waitq_type(wq) == WQT_PORT_SET) {
		ipc_pset_t pset = ips_object_to_pset(object);
		return waitq_set_first_prepost(&pset->ips_wqset, WQS_PREPOST_PEEK) ?
		       FILTER_ACTIVE : 0;
	}

	ipc_port_t port = ip_object_to_port(object);
	struct kqueue *kqwl = knote_get_kq(kn);

	if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
		kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
		result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
	}

	first = ipc_kmsg_queue_first(&port->ip_messages.imq_messages);
	if (!first) {
		return result;
	}

	result = FILTER_ACTIVE;
	if (kn->kn_sfflags & MACH_RCV_MSG) {
		result |= FILTER_ADJUST_EVENT_QOS(first->ikm_qos_override);
	}

#if CONFIG_PREADOPT_TG
	struct thread_group *tg = ipc_kmsg_get_thread_group(first);
	if (tg) {
		struct kqueue *kq = knote_get_kq(kn);
		kqueue_set_preadopted_thread_group(kq, tg,
		    first->ikm_qos_override);
	}
#endif

	return result;
}

struct turnstile *
filt_ipc_kqueue_turnstile(struct knote *kn)
{
	assert(kn->kn_filter == EVFILT_MACHPORT || kn->kn_filter == EVFILT_WORKLOOP);
	return kqueue_turnstile(knote_get_kq(kn));
}

bool
filt_machport_kqueue_has_turnstile(struct knote *kn)
{
	assert(kn->kn_filter == EVFILT_MACHPORT);
	return ((kn->kn_sfflags & MACH_RCV_MSG) || (kn->kn_sfflags & MACH_RCV_SYNC_PEEK))
	       && (kn->kn_flags & EV_DISPATCH);
}

/*
 * Stashes a port that participates in sync IPC override on the knote until
 * the knote is re-enabled.
 *
 * It returns:
 * - the turnstile to use as an inheritor for the stashed port
 * - the kind of stash that happened as PORT_SYNC_* value among:
 *   o not stashed (no sync IPC support)
 *   o stashed in the knote (in kn_ext[3])
 *   o to be hooked to the kn_hook knote
 */
struct turnstile *
filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
{
	struct turnstile *ts = TURNSTILE_NULL;

	if (kn->kn_filter == EVFILT_WORKLOOP) {
		assert(kn->kn_ipc_obj == NULL);
		kn->kn_ipc_obj = ip_to_object(port);
		ip_reference(port);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
		}
		ts = filt_ipc_kqueue_turnstile(kn);
	} else if (!filt_machport_kqueue_has_turnstile(kn)) {
		if (link) {
			*link = PORT_SYNC_LINK_NO_LINKAGE;
		}
	} else if (kn->kn_ext[3] == 0) {
		ip_reference(port);
		kn->kn_ext[3] = (uintptr_t)port;
		ts = filt_ipc_kqueue_turnstile(kn);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
		}
	} else {
		ts = (struct turnstile *)knote_kn_hook_get_raw(kn);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_STASH;
		}
	}

	return ts;
}

/*
 * Lazily prepare a turnstile so that filt_machport_stash_port()
 * can be called with the mqueue lock held.
 *
 * It will allocate a turnstile in kn_hook if:
 * - the knote supports sync IPC override,
 * - we already stashed a port in kn_ext[3],
 * - the object that will be copied out has a chance to ask to be stashed.
 *
 * It is set up so that its inheritor is the workloop turnstile that has been
 * allocated when this knote was attached.
 */
void
filt_machport_turnstile_prepare_lazily(
	struct knote *kn,
	mach_msg_type_name_t msgt_name,
	ipc_port_t port)
{
	/* This is called from within filt_machportprocess */
	assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED));

	if (!filt_machport_kqueue_has_turnstile(kn)) {
		return;
	}

	if (kn->kn_ext[3] == 0 || knote_kn_hook_get_raw(kn)) {
		return;
	}

	struct turnstile *ts = filt_ipc_kqueue_turnstile(kn);
	if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) ||
	    (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) {
		struct turnstile *kn_ts = turnstile_alloc();
		struct turnstile *ts_store;
		kn_ts = turnstile_prepare((uintptr_t)kn, &ts_store, kn_ts, TURNSTILE_KNOTE);
		knote_kn_hook_set_raw(kn, ts_store);

		turnstile_update_inheritor(kn_ts, ts,
		    TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
		turnstile_cleanup();
	}
}

static void
filt_machport_turnstile_complete_port(struct knote *kn, ipc_port_t port)
{
	struct turnstile *ts = TURNSTILE_NULL;

	ip_mq_lock(port);
	if (port->ip_specialreply) {
		/*
		 * If the reply has been sent to the special reply port already,
		 * then the special reply port may already be reused to do something
		 * entirely different.
		 *
		 * However, the only reason for it to still point to this knote is
		 * that it's still waiting for a reply, so when this is the case,
		 * neuter the linkage.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_sync_inheritor_knote == kn) {
			ipc_port_adjust_special_reply_port_locked(port, NULL,
			    (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
			/* port unlocked */
		} else {
			ip_mq_unlock(port);
		}
	} else {
		/*
		 * For receive rights, if their IMQ_KNOTE() is still this
		 * knote, then sever the link.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_messages.imq_inheritor_knote == kn) {
			ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
			ts = port_send_turnstile(port);
		}
		if (ts) {
			turnstile_reference(ts);
			turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
			    TURNSTILE_IMMEDIATE_UPDATE);
		}
		ip_mq_unlock(port);

		if (ts) {
			turnstile_update_inheritor_complete(ts,
			    TURNSTILE_INTERLOCK_NOT_HELD);
			turnstile_deallocate(ts);
		}
	}

	ip_release(port);
}

void
filt_wldetach_sync_ipc(struct knote *kn)
{
	ipc_object_t io = kn->kn_ipc_obj;
	filt_machport_turnstile_complete_port(kn, ip_object_to_port(io));
	kn->kn_ipc_obj = IO_NULL;
}

/*
 * Other half of filt_machport_turnstile_prepare_lazily()
 *
 * This is serialized by the knote state machine.
 */
static void
filt_machport_turnstile_complete(struct knote *kn)
{
	if (kn->kn_ext[3]) {
		ipc_port_t port = (ipc_port_t)kn->kn_ext[3];
		filt_machport_turnstile_complete_port(kn, port);
		kn->kn_ext[3] = 0;
	}

	struct turnstile *ts = knote_kn_hook_get_raw(kn);
	if (ts) {
		turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
		    TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);

		struct turnstile *ts_store = ts;
		turnstile_complete((uintptr_t)kn, (struct turnstile **)&ts_store, &ts, TURNSTILE_KNOTE);
		knote_kn_hook_set_raw(kn, ts_store);

		turnstile_cleanup();

		assert(ts);
		turnstile_deallocate(ts);
	}
}

static void
filt_machport_link(struct klist *klist, struct knote *kn)
{
	struct knote *hd = SLIST_FIRST(klist);

	if (hd && filt_machport_kqueue_has_turnstile(kn)) {
		SLIST_INSERT_AFTER(hd, kn, kn_selnext);
	} else {
		SLIST_INSERT_HEAD(klist, kn, kn_selnext);
	}
}

static void
filt_machport_unlink(struct klist *klist, struct knote *kn)
{
	struct knote **knprev;

	KNOTE_DETACH(klist, kn);

	/* make sure the first knote is a knote we can push on */
	SLIST_FOREACH_PREVPTR(kn, knprev, klist, kn_selnext) {
		if (filt_machport_kqueue_has_turnstile(kn)) {
			*knprev = SLIST_NEXT(kn, kn_selnext);
			SLIST_INSERT_HEAD(klist, kn, kn_selnext);
			break;
		}
	}
}

int
filt_wlattach_sync_ipc(struct knote *kn)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	ipc_port_t port = IP_NULL;
	int error = 0;

	if (ipc_right_lookup_read(space, name, &bits, &object) != KERN_SUCCESS) {
		return ENOENT;
	}
	/* object is locked and active */

	if (bits & MACH_PORT_TYPE_RECEIVE) {
		port = ip_object_to_port(object);
		if (port->ip_specialreply) {
			error = ENOENT;
		}
	} else if (bits & MACH_PORT_TYPE_SEND_ONCE) {
		port = ip_object_to_port(object);
		if (!port->ip_specialreply) {
			error = ENOENT;
		}
	} else {
		error = ENOENT;
	}
	if (error) {
		io_unlock(object);
		return error;
	}

	if (port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
		io_unlock(object);
		/*
		 * We cannot start a sync IPC inheritance chain, only extend an
		 * existing one.
		 * Note: this can also happen if the inheritance chain broke
		 * because the original requestor died.
		 */
		return ENOENT;
	}

	if (port->ip_specialreply) {
		ipc_port_adjust_special_reply_port_locked(port, kn,
		    IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE);
	} else {
		ipc_port_adjust_port_locked(port, kn, FALSE);
	}

	/* make sure the port was stashed */
	assert(kn->kn_ipc_obj == ip_to_object(port));

	/* port has been unlocked by ipc_port_adjust_* */

	return 0;
}

static int
filt_machportattach(
	struct knote *kn,
	__unused struct kevent_qos_s *kev)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	struct turnstile *send_turnstile = TURNSTILE_NULL;

	int error = 0;
	int result = 0;
	kern_return_t kr;

	kn->kn_flags &= ~EV_EOF;
	kn->kn_ext[3] = 0;

	if (filt_machport_kqueue_has_turnstile(kn)) {
		/*
		 * If the filter is likely to support sync IPC override,
		 * and it happens to be attaching to a workloop,
		 * make sure the workloop has an allocated turnstile.
		 */
		kqueue_alloc_turnstile(knote_get_kq(kn));
	}

	kr = ipc_right_lookup_read(space, name, &bits, &object);

	if (kr != KERN_SUCCESS) {
		error = ENOENT;
		goto out;
	}
	/* object is locked and active */

	if (bits & MACH_PORT_TYPE_PORT_SET) {
		ipc_pset_t pset = ips_object_to_pset(object);

		io_reference(object);
		kn->kn_ipc_obj = object;
		filt_machport_link(&pset->ips_klist, kn);
		result = filt_machport_filter_result(kn, object);
		io_unlock(object);
	} else if (bits & MACH_PORT_TYPE_RECEIVE) {
		ipc_port_t port = ip_object_to_port(object);

		if (port->ip_specialreply) {
			/*
			 * Registering for kevents on special reply ports
			 * isn't supported for two reasons:
			 *
			 * 1. it really makes very little sense for a port that
			 *    is supposed to be used synchronously
			 *
			 * 2. the port's ip_klist field will be used to
			 *    store the receive turnstile, so we can't possibly
			 *    attach a knote anyway.
			 */
			io_unlock(object);
			error = ENOTSUP;
			goto out;
		}

		io_reference(object);
		kn->kn_ipc_obj = object;
		if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) {
			/*
			 * We're attaching a port that used to have an IMQ_KNOTE,
			 * clobber this state, we'll fixup its turnstile inheritor below.
			 */
			ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
		}

		filt_machport_link(&port->ip_klist, kn);
		result = filt_machport_filter_result(kn, object);

		/*
		 * Update the port's turnstile inheritor
		 *
		 * Unlike filt_machportdetach(), we don't have to care about races for
		 * turnstile_workloop_pusher_info(): filt_machport_link() doesn't affect
		 * already pushing knotes, and if the current one becomes the new
		 * pusher, it'll only be visible when turnstile_workloop_pusher_info()
		 * returns.
		 */
		send_turnstile = port_send_turnstile(port);
		if (send_turnstile) {
			turnstile_reference(send_turnstile);
			ipc_port_send_update_inheritor(port, send_turnstile,
			    TURNSTILE_IMMEDIATE_UPDATE);

			/*
			 * rdar://problem/48861190
			 *
			 * When a listener connection resumes a peer,
			 * updating the inheritor above has moved the push
			 * from the current thread to the workloop.
			 *
			 * However, we haven't told the workloop yet
			 * that it needs a thread request, and we risk
			 * being preempted as soon as we drop the space
			 * lock below.
			 *
			 * To avoid this, disable preemption and let kevent
			 * reenable it after it takes the kqlock.
			 */
			disable_preemption();
			result |= FILTER_THREADREQ_NODEFEER;
		}

		io_unlock(object);

		if (send_turnstile) {
			turnstile_update_inheritor_complete(send_turnstile,
			    TURNSTILE_INTERLOCK_NOT_HELD);
			turnstile_deallocate_safe(send_turnstile);
		}
	} else {
		io_unlock(object);
		error = ENOTSUP;
	}

out:
	/* bail out on errors */
	if (error) {
		knote_set_error(kn, error);
		return 0;
	}

	return result;
}

static void
filt_machportdetach(
	struct knote *kn)
{
	ipc_object_t object = kn->kn_ipc_obj;
	struct turnstile *send_turnstile = TURNSTILE_NULL;

	filt_machport_turnstile_complete(kn);

	io_lock(object);
	if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
		/*
		 * ipc_mqueue_changed() already unhooked this knote from the waitq.
		 */
	} else {
		ipc_port_t port = IP_NULL;

		/*
		 * When the knote being detached is the first one in the list,
		 * then unlinking the knote *and* updating the turnstile inheritor
		 * need to happen atomically with respect to the callers of
		 * turnstile_workloop_pusher_info().
		 *
		 * The caller of turnstile_workloop_pusher_info() will use the kq req
		 * lock (and hence the kqlock), so we just need to hold the kqlock too.
		 */
		if (io_otype(object) == IOT_PORT) {
			port = ip_object_to_port(object);
			assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY);
			if (kn == SLIST_FIRST(&port->ip_klist)) {
				send_turnstile = port_send_turnstile(port);
			}
			filt_machport_unlink(&port->ip_klist, kn);
			struct kqueue *kq = knote_get_kq(kn);
			kqueue_set_iotier_override(kq, THROTTLE_LEVEL_END);
		} else {
			ipc_pset_t pset = ips_object_to_pset(object);

			filt_machport_unlink(&pset->ips_klist, kn);
		}


		if (send_turnstile) {
			turnstile_reference(send_turnstile);
			ipc_port_send_update_inheritor(port, send_turnstile,
			    TURNSTILE_IMMEDIATE_UPDATE);
		}
	}

	/* Clear the knote pointer once the knote has been removed from turnstile */
	kn->kn_ipc_obj = IO_NULL;
	io_unlock(object);

	if (send_turnstile) {
		turnstile_update_inheritor_complete(send_turnstile,
		    TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate(send_turnstile);
	}

	io_release(object);
}

/*
 * filt_machportevent - deliver events into the mach port filter
 *
 * Mach port message arrival events are currently only posted via the
 * kqueue filter routine for ports.
 *
 * If there is a message at the head of the queue,
 * we indicate that the knote should go active.  If
 * the message is to be direct-received, we adjust the
 * QoS of the knote according to the requested and override
 * QoS of that first message.
 *
 * When the knote is for a port-set, the hint is non-0
 * and is the waitq which is posting.
 */
static int
filt_machportevent(struct knote *kn, long hint __assert_only)
{
	if (io_otype(kn->kn_ipc_obj) == IOT_PORT_SET) {
		/*
		 * When called for a port-set,
		 * the posting port waitq is locked.
		 *
		 * waitq_set_first_prepost()
		 * in filt_machport_filter_result()
		 * would try to lock it and be very sad.
		 *
		 * Just trust what we know to be true.
		 */
		assert(hint != 0);
		return FILTER_ACTIVE;
	}
	assert(hint == 0);
	return filt_machport_filter_result(kn, kn->kn_ipc_obj);
}

void
ipc_pset_prepost(struct waitq_set *wqs, struct waitq *waitq)
{
	KNOTE(&ips_from_waitq(wqs)->ips_klist, (long)waitq);
}

static int
filt_machporttouch(
	struct knote *kn,
	struct kevent_qos_s *kev)
{
	ipc_object_t object = kn->kn_ipc_obj;
	int result = 0;

	/*
	 * Specifying MACH_RCV_MSG or MACH_RCV_SYNC_PEEK during attach results in
	 * allocation of a turnstile. Modifying the filter flags to include these
	 * flags later, without a turnstile being allocated, leads to
	 * inconsistencies.
	 */
	if ((kn->kn_sfflags ^ kev->fflags) & (MACH_RCV_MSG | MACH_RCV_SYNC_PEEK)) {
		kev->flags |= EV_ERROR;
		kev->data = EINVAL;
		return 0;
	}

	/* copy in new settings and save off new input fflags */
	kn->kn_sfflags = kev->fflags;
	kn->kn_ext[0] = kev->ext[0];
	kn->kn_ext[1] = kev->ext[1];

	if (kev->flags & EV_ENABLE) {
		/*
		 * If the knote is being enabled, make sure there are no lingering
		 * IPC overrides from the previous message delivery.
		 */
		filt_machport_turnstile_complete(kn);
	}

	io_lock(object);
	result = filt_machport_filter_result(kn, object);
	io_unlock(object);

	return result;
}
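
/*
 * Illustrative userspace sketch (editorial, not part of the original file):
 * re-arming an EV_DISPATCH'ed EVFILT_MACHPORT knote. Because of the check
 * above, the re-enable must carry the same MACH_RCV_MSG / MACH_RCV_SYNC_PEEK
 * bits that were used at attach time; the buffer in ext[0]/ext[1] may change.
 * Names are hypothetical; kevent_qos() is the assumed private interface.
 *
 *	struct kevent_qos_s kev = {
 *		.ident  = rcv_name,
 *		.filter = EVFILT_MACHPORT,
 *		.flags  = EV_ENABLE | EV_DISPATCH,
 *		.fflags = MACH_RCV_MSG | MACH_RCV_VOUCHER,	// same rcv bits as attach
 *		.ext    = { [0] = (uint64_t)(uintptr_t)buf, [1] = buf_size },
 *	};
 *	(void)kevent_qos(kq, &kev, 1, NULL, 0, NULL, NULL, 0);
 */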

static int
filt_machportprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_object_t object = kn->kn_ipc_obj;
	thread_t self = current_thread();
	kevent_ctx_t kectx = NULL;

	wait_result_t wresult;
	mach_msg_option64_t option64;
	mach_vm_address_t msg_addr;
	mach_msg_size_t max_msg_size, cpout_aux_size, cpout_msg_size;
	uint32_t ppri;
	mach_msg_qos_t oqos;

	int result = FILTER_ACTIVE;

	/* Capture current state */
	knote_fill_kevent(kn, kev, MACH_PORT_NULL);

	/* Clear port reference, use ext3 as size of msg aux data */
	kev->ext[3] = 0;

	/* If already deallocated/moved return one last EOF event */
	if (kev->flags & EV_EOF) {
		return FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
	}

	/*
	 * Only honor supported receive options. If no options are
	 * provided, just force a MACH_RCV_LARGE to detect the
	 * name of the port and size of the waiting message.
	 *
	 * Extend kn_sfflags to 64 bits.
	 */
	option64 = (mach_msg_option64_t)kn->kn_sfflags & (MACH_RCV_MSG |
	    MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
	    MACH_RCV_TRAILER_MASK | MACH_RCV_VOUCHER | MACH_MSG_STRICT_REPLY);

	if (option64 & MACH_RCV_MSG) {
		msg_addr = (mach_vm_address_t) kn->kn_ext[0];
		max_msg_size = (mach_msg_size_t) kn->kn_ext[1];

		/*
		 * Copy out the incoming message as vector, and append aux data
		 * immediately after the message proper (if any) and report its
		 * size on ext3.
		 */
		option64 |= (MACH64_MSG_VECTOR | MACH64_RCV_LINEAR_VECTOR);

		/*
		 * If the kevent didn't specify a buffer and length, carve a buffer
		 * from the filter processing data according to the flags.
		 */
		if (max_msg_size == 0) {
			kectx = kevent_get_context(self);
			msg_addr  = (mach_vm_address_t)kectx->kec_data_out;
			max_msg_size  = (mach_msg_size_t)kectx->kec_data_resid;
			option64 |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY);
			/* Receive vector linearly onto stack */
			if (kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) {
				option64 |= MACH64_RCV_STACK;
			}
		}
	} else {
		/* just detect the port name (if a set) and size of the first message */
		option64 = MACH_RCV_LARGE;
		msg_addr = 0;
		max_msg_size = 0;
	}

	/*
	 * Set up to receive a message or the notification of a
	 * too large message.  But never allow this call to wait.
	 * If the user provided additional options, like trailer
	 * options, pass those through here.  But we don't support
	 * scatter lists through this interface.
	 *
	 * Note: while in filt_machportprocess(),
	 *       the knote has a reference on `object` that we can borrow.
	 */
	self->ith_object = object;

	/* Using msg_addr as combined buffer for message proper and aux */
	self->ith_msg_addr = msg_addr;
	self->ith_max_msize = max_msg_size;
	self->ith_msize = 0;

	self->ith_aux_addr = 0;
	self->ith_max_asize = 0;
	self->ith_asize = 0;

	self->ith_option = option64;
	self->ith_receiver_name = MACH_PORT_NULL;
	option64 |= MACH_RCV_TIMEOUT; // never wait
	self->ith_state = MACH_RCV_IN_PROGRESS;
	self->ith_knote = kn;

	io_lock(object);

	wresult = ipc_mqueue_receive_on_thread_and_unlock(
		io_waitq(object),
		option64,
		self->ith_max_msize,       /* max msg size */
		0,                         /* max aux size 0, using combined buffer */
		0,                         /* immediate timeout */
		THREAD_INTERRUPTIBLE,
		self);
	/* port unlocked */

	/* If we timed out, or the process is exiting, just return zero. */
	if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) {
		assert(self->turnstile != TURNSTILE_NULL);
		self->ith_knote = ITH_KNOTE_NULL;
		return 0;
	}

	assert(wresult == THREAD_NOT_WAITING);
	assert(self->ith_state != MACH_RCV_IN_PROGRESS);

	/*
	 * If we weren't attempting to receive a message
	 * directly, we need to return the port name in
	 * the kevent structure.
	 */
	if ((option64 & MACH_RCV_MSG) != MACH_RCV_MSG) {
		assert(self->ith_state == MACH_RCV_TOO_LARGE);
		assert(self->ith_kmsg == IKM_NULL);
		kev->data = self->ith_receiver_name;
		self->ith_knote = ITH_KNOTE_NULL;
		return result;
	}

#if CONFIG_PREADOPT_TG
	/*
	 * If we're the first EVFILT_MACHPORT knote that is being processed for this
	 * kqwl, then make sure to preadopt the thread group from the kmsg we're
	 * about to receive. This is to make sure that we fix up the preadoption
	 * thread group correctly on the receive side for the first message.
	 */
	struct kqueue *kq = knote_get_kq(kn);

	if (self->ith_kmsg) {
		struct thread_group *tg = ipc_kmsg_get_thread_group(self->ith_kmsg);

		kqueue_process_preadopt_thread_group(self, kq, tg);
	}
#endif
	ipc_port_t port = ip_object_to_port(object);
	struct kqueue *kqwl = knote_get_kq(kn);
	if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
		/*
		 * Lock the port to make sure port->ip_kernel_iotier_override does
		 * not change while updating the kqueue override, else kqueue could
		 * have old iotier value.
		 */
		ip_mq_lock(port);
		kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
		result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
		ip_mq_unlock(port);
	}

	/*
	 * Attempt to receive the message directly, returning
	 * the results in the fflags field.
	 */
	io_reference(object);
	kev->fflags = mach_msg_receive_results_kevent(&cpout_msg_size,
	    &cpout_aux_size, &ppri, &oqos);

	/* kmsg and object reference consumed */

	/*
	 * If the user asked for the identity of ports containing a
	 * too-large message, return it in the data field (as we
	 * do for messages we didn't try to receive).
	 */
	if (kev->fflags == MACH_RCV_TOO_LARGE) {
		kev->ext[1] = self->ith_msize;
		kev->ext[3] = self->ith_asize;  /* Only lower 32 bits of ext3 are used */
		if (option64 & MACH_RCV_LARGE_IDENTITY) {
			kev->data = self->ith_receiver_name;
		} else {
			kev->data = MACH_PORT_NULL;
		}
	} else {
		kev->ext[1] = cpout_msg_size;
		kev->ext[3] = cpout_aux_size; /* Only lower 32 bits of ext3 are used */
		kev->data = MACH_PORT_NULL;
	}

	/*
	 * If we used a data buffer carved out from the filt_process data,
	 * store the address used in the knote and adjust the residual and
	 * other parameters for future use.
	 */
	if (kectx) {
		assert(kectx->kec_data_resid >= cpout_msg_size + cpout_aux_size);
		kectx->kec_data_resid -= cpout_msg_size + cpout_aux_size;
		if ((kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) == 0) {
			kev->ext[0] = kectx->kec_data_out;
			kectx->kec_data_out += cpout_msg_size + cpout_aux_size;
		} else {
			assert(option64 & MACH64_RCV_STACK);
			kev->ext[0] = kectx->kec_data_out + kectx->kec_data_resid;
		}
	}

	/*
	 * Apply message-based QoS values to output kevent as prescribed.
	 * The kev->ext[2] field gets (msg-qos << 32) | (override-qos).
	 */
	if (kev->fflags == MACH_MSG_SUCCESS) {
		kev->ext[2] = ((uint64_t)ppri << 32) |
		    _pthread_priority_make_from_thread_qos(oqos, 0, 0);
	}

	self->ith_knote = ITH_KNOTE_NULL;
	return result;
}
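
/*
 * Illustrative userspace sketch (editorial, not part of the original file):
 * unpacking the output fields described above after a direct-delivery
 * (MACH_RCV_MSG) event fires. The field layout follows the comments in this
 * file; the variable names are hypothetical.
 *
 *	// kev is the struct kevent_qos_s returned for the EVFILT_MACHPORT knote
 *	mach_msg_header_t *hdr   = (mach_msg_header_t *)(uintptr_t)kev.ext[0];
 *	mach_msg_size_t    msize = (mach_msg_size_t)kev.ext[1];	// message size
 *	mach_msg_size_t    asize = (mach_msg_size_t)kev.ext[3];	// aux data size
 *
 *	// ext[2] packs (pthread priority of the message << 32) | (override qos
 *	// as a pthread priority), valid when fflags == MACH_MSG_SUCCESS.
 *	pthread_priority_t msg_pp      = (pthread_priority_t)(kev.ext[2] >> 32);
 *	pthread_priority_t override_pp = (pthread_priority_t)(kev.ext[2] & 0xffffffffu);
 */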

static void
filt_machportsanitizedcopyout(struct knote *kn, struct kevent_qos_s *kev)
{
	*kev = *(struct kevent_qos_s *)&kn->kn_kevent;

	// We may have stashed the address of the port that is pushing on the sync
	// IPC, so clear it out.
	kev->ext[3] = 0;
}

SECURITY_READ_ONLY_EARLY(struct filterops) machport_filtops = {
	.f_adjusts_qos = true,
	.f_extended_codes = true,
	.f_attach = filt_machportattach,
	.f_detach = filt_machportdetach,
	.f_event = filt_machportevent,
	.f_touch = filt_machporttouch,
	.f_process = filt_machportprocess,
	.f_sanitized_copyout = filt_machportsanitizedcopyout,
};