1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: ipc/ipc_pset.c
60 * Author: Rich Draves
61 * Date: 1989
62 *
63 * Functions to manipulate IPC port sets.
64 */
65
66 #include <mach/port.h>
67 #include <mach/kern_return.h>
68 #include <mach/message.h>
69 #include <ipc/ipc_mqueue.h>
70 #include <ipc/ipc_object.h>
71 #include <ipc/ipc_policy.h>
72 #include <ipc/ipc_pset.h>
73 #include <ipc/ipc_right.h>
74 #include <ipc/ipc_space.h>
75 #include <ipc/ipc_port.h>
76 #include <ipc/ipc_kmsg.h>
77 #include <kern/policy_internal.h>
78
79 #include <kern/kern_types.h>
80
81 #include <vm/vm_map.h>
82 #include <libkern/section_keywords.h>
83 #include <pthread/priority_private.h>
84
/* processor_set stole ipc_pset_init */
/*
 * Initialize the waitq-set and knote list embedded in a fresh port set.
 *
 * `policy` is OR-ed with SYNC_POLICY_FIFO when initializing the waitq-set
 * (callers pass e.g. SYNC_POLICY_INIT_LOCKED to get the set back locked).
 * The low bits of `name` are recorded on the waitq-set so the set can be
 * identified later from its waitq alone.
 */
static void
ipc_port_set_init(ipc_pset_t pset, mach_port_name_t name, int policy)
{
	waitq_init(&pset->ips_wqset, WQT_PORT_SET, policy | SYNC_POLICY_FIFO);
	klist_init(&pset->ips_klist);
	pset->ips_wqset.wqset_index = MACH_PORT_INDEX(name);
}
93
/*
 * Lock a port set: validate the object first, then take the
 * embedded waitq-set lock (which serves as the port set lock).
 */
void
ipc_pset_lock(ipc_pset_t pset)
{
	ips_validate(pset);
	waitq_lock(&pset->ips_wqset);
}
100
101 /*
102 * Routine: ipc_pset_alloc
103 * Purpose:
104 * Allocate a port set.
105 * Conditions:
106 * Nothing locked. If successful, the port set is returned
107 * locked. (The caller doesn't have a reference.)
108 * Returns:
109 * KERN_SUCCESS The port set is allocated.
110 * KERN_INVALID_TASK The space is dead.
111 * KERN_NO_SPACE No room for an entry in the space.
112 */
113
kern_return_t
ipc_pset_alloc(
	ipc_space_t             space,          /* space to allocate the set in */
	mach_port_name_t        *namep,         /* out: name chosen for the set */
	ipc_pset_t              *psetp)         /* out: the new set, locked */
{
	ipc_pset_t pset;
	mach_port_name_t name;
	kern_return_t kr;

	/* allocate the object and an entry in the space; locks the space */
	kr = ipc_object_alloc(space, IOT_PORT_SET,
	    MACH_PORT_TYPE_PORT_SET, 0,
	    &name, (ipc_object_t *) &pset);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* space is locked */

	/* SYNC_POLICY_INIT_LOCKED: the set is returned to us locked */
	ipc_port_set_init(pset, name, SYNC_POLICY_INIT_LOCKED);
	/* port set is locked */

	is_write_unlock(space);

	*namep = name;
	*psetp = pset;
	return KERN_SUCCESS;
}
141
142 /*
143 * Routine: ipc_pset_alloc_name
144 * Purpose:
145 * Allocate a port set, with a specific name.
146 * Conditions:
147 * Nothing locked. If successful, the port set is returned
148 * locked. (The caller doesn't have a reference.)
149 * Returns:
150 * KERN_SUCCESS The port set is allocated.
151 * KERN_INVALID_TASK The space is dead.
152 * KERN_NAME_EXISTS The name already denotes a right.
153 */
154
kern_return_t
ipc_pset_alloc_name(
	ipc_space_t             space,          /* space to allocate the set in */
	mach_port_name_t        name,           /* caller-chosen name */
	ipc_pset_t              *psetp)         /* out: the new set, locked */
{
	/*
	 * The init block runs while the space is still locked, once the
	 * entry for `name` has been claimed; SYNC_POLICY_INIT_LOCKED makes
	 * the set come back locked, as the contract requires.
	 */
	return ipc_object_alloc_name(space, IOT_PORT_SET,
	           MACH_PORT_TYPE_PORT_SET, 0,
	           name, (ipc_object_t *)psetp, ^(ipc_object_t object){
		ipc_port_set_init(ips_object_to_pset(object), name,
		    SYNC_POLICY_INIT_LOCKED);
	});
}
168
169
170 /*
171 * Routine: ipc_pset_alloc_special
172 * Purpose:
173 * Allocate a port set in a special space.
174 * The new port set is returned with one ref.
175 * If unsuccessful, IPS_NULL is returned.
176 * Conditions:
177 * Nothing locked.
178 */
ipc_pset_t
ipc_pset_alloc_special(
	__assert_only ipc_space_t space)
{
	ipc_pset_t pset;

	/* special spaces are valid but never active */
	assert(space != IS_NULL);
	assert(!is_active(space));

	pset = ips_object_to_pset(io_alloc(IOT_PORT_SET, Z_WAITOK | Z_ZERO));
	if (pset == IPS_NULL) {
		return IPS_NULL;
	}

	/* hand-initialize the object header: active bits + the one ref we return */
	os_atomic_init(&pset->ips_object.io_bits, io_makebits(IOT_PORT_SET));
	os_atomic_init(&pset->ips_object.io_references, 1);

	/* policy 0: unlike ipc_pset_alloc(), the set is returned unlocked */
	ipc_port_set_init(pset, MACH_PORT_SPECIAL_DEFAULT, 0);

	return pset;
}
200
201
202 /*
203 * Routine: ipc_pset_destroy
204 * Purpose:
205 * Destroys a port_set.
206 * Conditions:
207 * The port_set is locked and alive.
208 * The caller has a reference, which is consumed.
209 * Afterwards, the port_set is unlocked and dead.
210 */
211
void
ipc_pset_destroy(
	ipc_space_t     space,
	ipc_pset_t      pset)
{
	/* links unhooked below are collected here and freed after unlock */
	waitq_link_list_t free_l = { };

	assert(ips_active(pset));

	/* mark the set dead so concurrent lookups see the change */
	io_bits_andnot(ips_to_object(pset), IO_BITS_ACTIVE);

	/*
	 * Set all waiters on the portset running to
	 * discover the change.
	 *
	 * Then under the same lock hold, deinit the waitq-set,
	 * which will remove all the member message queues,
	 * linkages and clean up preposts.
	 */
	ipc_mqueue_changed(space, &pset->ips_wqset);
	waitq_invalidate(&pset->ips_wqset);
	waitq_set_unlink_all_locked(&pset->ips_wqset, &free_l);

	ips_mq_unlock(pset);

	ips_release(pset);        /* consume the ref our caller gave us */

	/* free the collected links now that no lock is held */
	waitq_link_free_list(WQT_PORT_SET, &free_l);
}
241
242 /*
243 * Routine: ipc_pset_finalize
244 * Purpose:
245 * Called on last reference deallocate to
246 * free any remaining data associated with the pset.
247 * Conditions:
248 * Nothing locked.
249 */
void
ipc_pset_finalize(
	ipc_pset_t              pset)
{
	/* the waitq-set is the only state left to tear down at this point */
	waitq_deinit(&pset->ips_wqset);
}
256
257
258 #pragma mark - kevent support
259
260 /*
261 * Kqueue EVFILT_MACHPORT support
262 *
263 * - kn_ipc_{port,pset} points to the monitored ipc port or pset. If the knote
264 * is using a kqwl, it is eligible to participate in sync IPC overrides.
265 *
266 * For the first such sync IPC message in the port, we set up the port's
267 * turnstile to directly push on the kqwl's turnstile (which is in turn set up
268 * during filt_machportattach). If userspace responds to the message, the
 * turnstile push is severed at the point of reply. If userspace returns without
270 * responding to the message, we sever the turnstile push at the
271 * point of reenabling the knote to deliver the next message. This is why the
272 * knote needs to remember the port. For more details, see also
273 * filt_machport_turnstile_complete.
274 *
275 * If there are multiple other sync IPC messages in the port, messages 2 to n
 * redirect their turnstile push to the kqwl through an intermediary "knote"
 * turnstile which, in turn, pushes on the kqwl turnstile. This knote turnstile
278 * is stored in the kn_hook. See also filt_machport_turnstile_prepare_lazily.
279 *
280 * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer
281 * that can be used to direct-deliver messages when
282 * MACH_RCV_MSG is set in kn_sfflags
283 *
284 * - (in/out) ext[1] holds a mach_msg_size_t representing the size
285 * of the userspace buffer held in ext[0].
286 *
287 * - (out) ext[2] is used to deliver qos information
288 * about the send queue to userspace.
289 *
290 * - (abused) ext[3] is used in kernel to hold a reference to the first port
291 * with a turnstile that participate to sync IPC override. For more details,
292 * see filt_machport_stash_port
293 *
294 * - kn_hook is optionally a "knote" turnstile. It is used as the inheritor
295 * of turnstiles for rights copied out as part of direct message delivery
296 * when they can participate to sync IPC override.
297 *
298 * It is used to atomically neuter the sync IPC override when the knote is
299 * re-enabled.
300 *
301 */
302
303 #include <sys/event.h>
304 #include <sys/errno.h>
305
/*
 * Compute the filter result for a port-set knote.
 *
 * Returns FILTER_ACTIVE when at least one member queue has a preposted
 * message (peeked without consuming it), 0 when the set is empty or has
 * been invalidated.  Caller must hold the pset lock.
 */
static int
filt_pset_filter_result(ipc_pset_t pset)
{
	ips_mq_lock_held(pset);

	if (!waitq_is_valid(&pset->ips_wqset)) {
		return 0;
	}

	return waitq_set_first_prepost(&pset->ips_wqset, WQS_PREPOST_PEEK) ?
	       FILTER_ACTIVE : 0;
}
318
319 static int
filt_port_filter_result(struct knote * kn,ipc_port_t port)320 filt_port_filter_result(struct knote *kn, ipc_port_t port)
321 {
322 struct kqueue *kqwl = knote_get_kq(kn);
323 ipc_kmsg_t first;
324 int result = 0;
325
326 ip_mq_lock_held(port);
327
328 if (kn->kn_sfflags & MACH_RCV_MSG) {
329 result = FILTER_RESET_EVENT_QOS;
330 }
331
332 if (!waitq_is_valid(&port->ip_waitq)) {
333 return result;
334 }
335
336 if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
337 kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
338 result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
339 }
340
341 first = ipc_kmsg_queue_first(&port->ip_messages.imq_messages);
342 if (!first) {
343 return result;
344 }
345
346 result = FILTER_ACTIVE;
347 if (kn->kn_sfflags & MACH_RCV_MSG) {
348 result |= FILTER_ADJUST_EVENT_QOS(first->ikm_qos_override);
349 }
350
351 #if CONFIG_PREADOPT_TG
352 struct thread_group *tg = ipc_kmsg_get_thread_group(first);
353 if (tg) {
354 struct kqueue *kq = knote_get_kq(kn);
355 kqueue_set_preadopted_thread_group(kq, tg,
356 first->ikm_qos_override);
357 }
358 #endif
359
360 return result;
361 }
362
/*
 * Return the turnstile of the kqueue this knote is attached to.
 * Only valid for the machport and workloop filters.
 */
struct turnstile *
filt_ipc_kqueue_turnstile(struct knote *kn)
{
	assert(kn->kn_filter == EVFILT_MACHPORT || kn->kn_filter == EVFILT_WORKLOOP);
	return kqueue_turnstile(knote_get_kq(kn));
}
369
370 bool
filt_machport_kqueue_has_turnstile(struct knote * kn)371 filt_machport_kqueue_has_turnstile(struct knote *kn)
372 {
373 assert(kn->kn_filter == EVFILT_MACHPORT);
374 return ((kn->kn_sfflags & MACH_RCV_MSG) || (kn->kn_sfflags & MACH_RCV_SYNC_PEEK))
375 && (kn->kn_flags & EV_DISPATCH);
376 }
377
378 /*
 * Stashes a port that participates in sync IPC override on the knote until the
380 * knote is re-enabled.
381 *
382 * It returns:
383 * - the turnstile to use as an inheritor for the stashed port
384 * - the kind of stash that happened as PORT_SYNC_* value among:
385 * o not stashed (no sync IPC support)
386 * o stashed in the knote (in kn_ext[3])
387 * o to be hooked to the kn_hook knote
388 */
struct turnstile *
filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
{
	struct turnstile *ts = TURNSTILE_NULL;

	if (kn->kn_filter == EVFILT_WORKLOOP) {
		/* workloop knotes stash the port in kn_ipc_port directly */
		assert(kn->kn_ipc_port == NULL);
		kn->kn_ipc_port = port;
		ip_reference(port);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
		}
		ts = filt_ipc_kqueue_turnstile(kn);
	} else if (!filt_machport_kqueue_has_turnstile(kn)) {
		/* no sync IPC support on this knote: nothing to stash */
		if (link) {
			*link = PORT_SYNC_LINK_NO_LINKAGE;
		}
	} else if (kn->kn_ext[3] == 0) {
		/* first port: stash it (with a ref) in kn_ext[3] */
		ip_reference(port);
		kn->kn_ext[3] = (uintptr_t)port;
		ts = filt_ipc_kqueue_turnstile(kn);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
		}
	} else {
		/*
		 * kn_ext[3] is taken: subsequent ports push through the
		 * "knote" turnstile prepared lazily in kn_hook
		 * (see filt_machport_turnstile_prepare_lazily).
		 */
		ts = (struct turnstile *)knote_kn_hook_get_raw(kn);
		if (link) {
			*link = PORT_SYNC_LINK_WORKLOOP_STASH;
		}
	}

	return ts;
}
422
423 /*
424 * Lazily prepare a turnstile so that filt_machport_stash_port()
425 * can be called with the mqueue lock held.
426 *
427 * It will allocate a turnstile in kn_hook if:
428 * - the knote supports sync IPC override,
429 * - we already stashed a port in kn_ext[3],
430 * - the object that will be copied out has a chance to ask to be stashed.
431 *
432 * It is setup so that its inheritor is the workloop turnstile that has been
433 * allocated when this knote was attached.
434 */
void
filt_machport_turnstile_prepare_lazily(
	struct knote *kn,
	mach_msg_type_name_t msgt_name,
	ipc_port_t port)
{
	/* This is called from within filt_machportprocess */
	assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED));

	/* nothing to do for knotes without sync IPC support */
	if (!filt_machport_kqueue_has_turnstile(kn)) {
		return;
	}

	/* need a port already stashed in kn_ext[3], and no turnstile yet */
	if (kn->kn_ext[3] == 0 || knote_kn_hook_get_raw(kn)) {
		return;
	}

	struct turnstile *ts = filt_ipc_kqueue_turnstile(kn);
	/*
	 * Only rights that may ask to be stashed warrant the allocation:
	 * a special-reply send-once right, or a receive right.
	 */
	if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) ||
	    (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) {
		struct turnstile *kn_ts = turnstile_alloc();
		struct turnstile *ts_store = TURNSTILE_NULL;
		kn_ts = turnstile_prepare((uintptr_t)kn, &ts_store, kn_ts, TURNSTILE_KNOTE);
		knote_kn_hook_set_raw(kn, ts_store);

		/* the knote turnstile pushes on the workloop turnstile */
		turnstile_update_inheritor(kn_ts, ts,
		    TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
		turnstile_cleanup();
	}
}
465
/*
 * Sever the sync IPC linkage between a knote and a port previously
 * stashed on it, then drop the stash's port reference.
 *
 * Handles both stash flavors: special reply ports (adjusts the special
 * reply linkage) and receive rights (clears IMQ_KNOTE linkage and tears
 * down the send turnstile's inheritor).
 */
static void
filt_machport_turnstile_complete_port(struct knote *kn, ipc_port_t port)
{
	struct turnstile *ts = TURNSTILE_NULL;

	ip_mq_lock(port);
	if (port->ip_specialreply) {
		/*
		 * If the reply has been sent to the special reply port already,
		 * then the special reply port may already be reused to do something
		 * entirely different.
		 *
		 * However, the only reason for it to still point to this knote is
		 * that it's still waiting for a reply, so when this is the case,
		 * neuter the linkage.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_sync_inheritor_knote == kn) {
			ipc_port_adjust_special_reply_port_locked(port, NULL,
			    (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
			/* port unlocked */
		} else {
			ip_mq_unlock(port);
		}
	} else {
		/*
		 * For receive rights, if their IMQ_KNOTE() is still this
		 * knote, then sever the link.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_messages.imq_inheritor_knote == kn) {
			ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
			ts = port_send_turnstile(port);
		}
		if (ts) {
			/* clear the inheritor while the port lock is held */
			turnstile_reference(ts);
			turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
			    TURNSTILE_IMMEDIATE_UPDATE);
		}
		ip_mq_unlock(port);

		if (ts) {
			/* finish the inheritor update outside the port lock */
			turnstile_update_inheritor_complete(ts,
			    TURNSTILE_INTERLOCK_NOT_HELD);
			turnstile_deallocate(ts);
		}
	}

	/* drop the reference the stash held on the port */
	ip_release(port);
}
516
517 void
filt_wldetach_sync_ipc(struct knote * kn)518 filt_wldetach_sync_ipc(struct knote *kn)
519 {
520 ipc_port_t port = kn->kn_ipc_port;
521 filt_machport_turnstile_complete_port(kn, port);
522 kn->kn_ipc_port = IP_NULL;
523 }
524
525 /*
526 * Other half of filt_machport_turnstile_prepare_lazily()
527 *
528 * This is serialized by the knote state machine.
529 */
static void
filt_machport_turnstile_complete(struct knote *kn)
{
	/* release the port stashed in kn_ext[3], if any */
	if (kn->kn_ext[3]) {
		ipc_port_t port = (ipc_port_t)kn->kn_ext[3];
		filt_machport_turnstile_complete_port(kn, port);
		kn->kn_ext[3] = 0;
	}

	/* tear down the lazily allocated "knote" turnstile, if any */
	struct turnstile *ts = knote_kn_hook_get_raw(kn);
	if (ts) {
		turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
		    TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);

		struct turnstile *ts_store = ts;
		turnstile_complete((uintptr_t)kn, (struct turnstile **)&ts_store, &ts, TURNSTILE_KNOTE);
		knote_kn_hook_set_raw(kn, ts_store);

		turnstile_cleanup();

		assert(ts);
		turnstile_deallocate(ts);
	}
}
555
556 static void
filt_machport_link(struct klist * klist,struct knote * kn)557 filt_machport_link(struct klist *klist, struct knote *kn)
558 {
559 struct knote *hd = SLIST_FIRST(klist);
560
561 if (hd && filt_machport_kqueue_has_turnstile(kn)) {
562 SLIST_INSERT_AFTER(hd, kn, kn_selnext);
563 } else {
564 SLIST_INSERT_HEAD(klist, kn, kn_selnext);
565 }
566 }
567
/*
 * Remove a knote from a port/pset klist, then restore the invariant
 * that the head of the list is a knote that can be pushed on: scan for
 * the first turnstile-backed knote and move it to the head.
 */
static void
filt_machport_unlink(struct klist *klist, struct knote *kn)
{
	struct knote **knprev;

	KNOTE_DETACH(klist, kn);

	/* make sure the first knote is a knote we can push on */
	SLIST_FOREACH_PREVPTR(kn, knprev, klist, kn_selnext) {
		if (filt_machport_kqueue_has_turnstile(kn)) {
			/* unlink it from its current spot, re-insert at the head */
			*knprev = SLIST_NEXT(kn, kn_selnext);
			SLIST_INSERT_HEAD(klist, kn, kn_selnext);
			break;
		}
	}
}
584
/*
 * Attach a workloop knote to the port named by kn_id for sync IPC
 * inheritance.  The right must be either a regular receive right
 * (not a kobject, not a special reply port) or a send-once right to
 * a special reply port, and must already be part of a sync IPC
 * inheritance chain.
 *
 * Returns 0 on success, ENOENT when the name/right is unsuitable.
 */
int
filt_wlattach_sync_ipc(struct knote *kn)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	ipc_port_t port = IP_NULL;
	int error = 0;

	if (ipc_right_lookup_read(space, name, &bits, &object) != KERN_SUCCESS) {
		return ENOENT;
	}
	/* object is locked and active */

	if (bits & MACH_PORT_TYPE_RECEIVE) {
		port = ip_object_to_port(object);
		/* special reply ports and kobject ports are not eligible */
		if (port->ip_specialreply || ip_is_kobject(port)) {
			error = ENOENT;
		}
	} else if (bits & MACH_PORT_TYPE_SEND_ONCE) {
		port = ip_object_to_port(object);
		/* a send-once right only qualifies for a special reply port */
		if (!port->ip_specialreply) {
			error = ENOENT;
		}
	} else {
		error = ENOENT;
	}
	if (error) {
		io_unlock(object);
		return error;
	}

	if (port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
		io_unlock(object);
		/*
		 * We cannot start a sync IPC inheritance chain, only further one
		 * Note: this can also happen if the inheritance chain broke
		 * because the original requestor died.
		 */
		return ENOENT;
	}

	if (port->ip_specialreply) {
		ipc_port_adjust_special_reply_port_locked(port, kn,
		    IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE);
	} else {
		ipc_port_adjust_port_locked(port, kn, FALSE);
	}

	/* make sure the port was stashed */
	assert(kn->kn_ipc_port == port);

	/* port has been unlocked by ipc_port_adjust_* */

	return 0;
}
642
/*
 * Attach a knote to a port set.  Called with the pset locked and
 * active (from filt_machportattach); unlocks it before returning.
 * Returns the initial filter result for the knote.
 */
static int
filt_psetattach(struct knote *kn, ipc_pset_t pset)
{
	int result = 0;

	/* the knote holds a ref on the pset until detach */
	ips_reference(pset);
	kn->kn_ipc_pset = pset;

	filt_machport_link(&pset->ips_klist, kn);
	result = filt_pset_filter_result(pset);
	ips_mq_unlock(pset);

	return result;
}
657
/*
 * Attach a knote to a port (receive right).  Called with the port
 * locked and active (from filt_machportattach); unlocks it before
 * returning.  Returns the initial filter result, possibly OR-ed with
 * FILTER_THREADREQ_NODEFEER (with preemption disabled) when a send
 * turnstile inheritor update was performed.
 */
static int
filt_portattach(struct knote *kn, ipc_port_t port)
{
	struct turnstile *send_turnstile = TURNSTILE_NULL;
	int result = 0;

	if (port->ip_specialreply) {
		/*
		 * Registering for kevents on special reply ports
		 * isn't supported for two reasons:
		 *
		 * 1. it really makes very little sense for a port that
		 *    is supposed to be used synchronously
		 *
		 * 2. their ports's ip_klist field will be used to
		 *    store the receive turnstile, so we can't possibly
		 *    attach them anyway.
		 */
		ip_mq_unlock(port);
		knote_set_error(kn, ENOTSUP);
		return 0;
	}

	/* the knote holds a ref on the port until detach */
	ip_reference(port);
	kn->kn_ipc_port = port;
	if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) {
		/*
		 * We're attaching a port that used to have an IMQ_KNOTE,
		 * clobber this state, we'll fixup its turnstile inheritor below.
		 */
		ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
	}

	filt_machport_link(&port->ip_klist, kn);
	result = filt_port_filter_result(kn, port);

	/*
	 * Update the port's turnstile inheritor
	 *
	 * Unlike filt_machportdetach(), we don't have to care about races for
	 * turnstile_workloop_pusher_info(): filt_machport_link() doesn't affect
	 * already pushing knotes, and if the current one becomes the new
	 * pusher, it'll only be visible when turnstile_workloop_pusher_info()
	 * returns.
	 */
	send_turnstile = port_send_turnstile(port);
	if (send_turnstile) {
		turnstile_reference(send_turnstile);
		ipc_port_send_update_inheritor(port, send_turnstile,
		    TURNSTILE_IMMEDIATE_UPDATE);

		/*
		 * rdar://problem/48861190
		 *
		 * When a listener connection resumes a peer,
		 * updating the inheritor above has moved the push
		 * from the current thread to the workloop.
		 *
		 * However, we haven't told the workloop yet
		 * that it needs a thread request, and we risk
		 * to be preempted as soon as we drop the space
		 * lock below.
		 *
		 * To avoid this disable preemption and let kevent
		 * reenable it after it takes the kqlock.
		 */
		disable_preemption();
		result |= FILTER_THREADREQ_NODEFEER;
	}

	ip_mq_unlock(port);

	if (send_turnstile) {
		/* finish the inheritor update now that the port lock is dropped */
		turnstile_update_inheritor_complete(send_turnstile,
		    TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate(send_turnstile);
	}

	return result;
}
738
/*
 * EVFILT_MACHPORT attach entry point: look up the name in the current
 * space and dispatch to the port-set or port attach routine, retargeting
 * kn_filtid so subsequent filter calls go to the right sub-filter.
 */
static int
filt_machportattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	kern_return_t kr;

	kn->kn_flags &= ~EV_EOF;
	kn->kn_ext[3] = 0;

	if (filt_machport_kqueue_has_turnstile(kn)) {
		/*
		 * If the filter is likely to support sync IPC override,
		 * and it happens to be attaching to a workloop,
		 * make sure the workloop has an allocated turnstile.
		 */
		kqueue_alloc_turnstile(knote_get_kq(kn));
	}

	kr = ipc_right_lookup_read(space, name, &bits, &object);

	if (kr != KERN_SUCCESS) {
		knote_set_error(kn, ENOENT);
		return 0;
	}
	/* object is locked and active */

	/* sub-attach routines consume the object lock */
	if (bits & MACH_PORT_TYPE_PORT_SET) {
		kn->kn_filtid = EVFILTID_MACH_PORT_SET;
		return filt_psetattach(kn, ips_object_to_pset(object));
	}

	if (bits & MACH_PORT_TYPE_RECEIVE) {
		kn->kn_filtid = EVFILTID_MACH_PORT;
		return filt_portattach(kn, ip_object_to_port(object));
	}

	/* only receive rights and port sets can be monitored */
	io_unlock(object);
	knote_set_error(kn, ENOTSUP);
	return 0;
}
782
/*
 * Detach a knote from its port set: tear down any sync IPC turnstile
 * state, unlink from the pset klist (unless the pset already vanished),
 * and drop the knote's pset reference.
 */
static void
filt_psetdetach(struct knote *kn)
{
	ipc_pset_t pset = kn->kn_ipc_pset;

	filt_machport_turnstile_complete(kn);

	ips_mq_lock(pset);

	if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
		/*
		 * ipc_mqueue_changed() already unhooked this knote from the waitq,
		 */
	} else {
		filt_machport_unlink(&pset->ips_klist, kn);
	}

	kn->kn_ipc_pset = IPS_NULL;
	ips_mq_unlock(pset);
	ips_release(pset);
}
804
/*
 * Detach a knote from its port: tear down any sync IPC turnstile state,
 * unlink from the port klist, refresh the send turnstile inheritor when
 * the detached knote was the pusher, and drop the knote's port reference.
 */
static void
filt_portdetach(struct knote *kn)
{
	ipc_port_t port = kn->kn_ipc_port;
	struct turnstile *send_turnstile = TURNSTILE_NULL;

	filt_machport_turnstile_complete(kn);

	ip_mq_lock(port);
	if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
		/*
		 * ipc_mqueue_changed() already unhooked this knote from the waitq,
		 */
	} else {
		/*
		 * When the knote being detached is the first one in the list,
		 * then unlinking the knote *and* updating the turnstile inheritor
		 * need to happen atomically with respect to the callers of
		 * turnstile_workloop_pusher_info().
		 *
		 * The caller of turnstile_workloop_pusher_info() will use the kq req
		 * lock (and hence the kqlock), so we just need to hold the kqlock too.
		 */
		assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY);
		if (kn == SLIST_FIRST(&port->ip_klist)) {
			send_turnstile = port_send_turnstile(port);
		}
		filt_machport_unlink(&port->ip_klist, kn);
		/* clear any IOTier override this knote had pushed on the kqueue */
		struct kqueue *kq = knote_get_kq(kn);
		kqueue_set_iotier_override(kq, THROTTLE_LEVEL_END);
	}

	if (send_turnstile) {
		/* recompute the inheritor now that this knote is unlinked */
		turnstile_reference(send_turnstile);
		ipc_port_send_update_inheritor(port, send_turnstile,
		    TURNSTILE_IMMEDIATE_UPDATE);
	}

	/* Clear the knote pointer once the knote has been removed from turnstile */
	kn->kn_ipc_port = IP_NULL;
	ip_mq_unlock(port);

	if (send_turnstile) {
		turnstile_update_inheritor_complete(send_turnstile,
		    TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate(send_turnstile);
	}

	ip_release(port);
}
855
856 /*
857 * filt_{pset,port}event - deliver events into the mach port filter
858 *
859 * Mach port message arrival events are currently only posted via the
860 * kqueue filter routine for ports.
861 *
862 * If there is a message at the head of the queue,
863 * we indicate that the knote should go active. If
864 * the message is to be direct-received, we adjust the
865 * QoS of the knote according the requested and override
866 * QoS of that first message.
867 *
868 * When the knote is for a port-set, the hint is non 0
869 * and is the waitq which is posting.
870 */
static int
filt_psetevent(struct knote *kn __unused, long hint __assert_only)
{
	/*
	 * When called for a port-set,
	 * the posting port waitq is locked.
	 *
	 * waitq_set_first_prepost()
	 * in filt_machport_filter_result()
	 * would try to lock it and be very sad.
	 *
	 * Just trust what we know to be true.
	 */
	/* hint is the posting waitq (see ipc_pset_prepost), so never 0 here */
	assert(hint != 0);
	return FILTER_ACTIVE;
}
887
/*
 * Event delivery for a port knote: no waitq is locked by the poster
 * (hint == 0), so the full filter result can be computed safely.
 */
static int
filt_portevent(struct knote *kn, long hint __assert_only)
{
	assert(hint == 0);
	return filt_port_filter_result(kn, kn->kn_ipc_port);
}
894
/*
 * Prepost callback for port sets: fire the pset's knotes, passing the
 * posting waitq as the (non-zero) hint expected by filt_psetevent().
 */
void
ipc_pset_prepost(struct waitq_set *wqs, struct waitq *waitq)
{
	KNOTE(&ips_from_waitq(wqs)->ips_klist, (long)waitq);
}
900
/*
 * Common "touch" handling for port and pset knotes: validate and copy in
 * the new kevent parameters.  On failure, sets EV_ERROR/EINVAL on `kev`.
 */
static void
filt_machporttouch(struct knote *kn, struct kevent_qos_s *kev)
{
	/*
	 * Specifying MACH_RCV_MSG or MACH_RCV_SYNC_PEEK during attach results in
	 * allocation of a turnstile. Modifying the filter flags to include these
	 * flags later, without a turnstile being allocated, leads to
	 * inconsistencies.
	 */
	if ((kn->kn_sfflags ^ kev->fflags) & (MACH_RCV_MSG | MACH_RCV_SYNC_PEEK)) {
		kev->flags |= EV_ERROR;
		kev->data = EINVAL;
		return;
	}

	/* copy in new settings and save off new input fflags */
	kn->kn_sfflags = kev->fflags;
	kn->kn_ext[0] = kev->ext[0];        /* receive buffer address */
	kn->kn_ext[1] = kev->ext[1];        /* receive buffer size */

	if (kev->flags & EV_ENABLE) {
		/*
		 * If the knote is being enabled, make sure there's no lingering
		 * IPC overrides from the previous message delivery.
		 */
		filt_machport_turnstile_complete(kn);
	}
}
929
/*
 * "touch" entry point for port-set knotes: apply the common parameter
 * update, then recompute the filter result under the pset lock.
 */
static int
filt_psettouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_pset_t pset = kn->kn_ipc_pset;
	int result = 0;

	filt_machporttouch(kn, kev);
	if (kev->flags & EV_ERROR) {
		return 0;
	}

	ips_mq_lock(pset);
	result = filt_pset_filter_result(pset);
	ips_mq_unlock(pset);

	return result;
}
947
/*
 * "touch" entry point for port knotes: apply the common parameter
 * update, then recompute the filter result under the port lock.
 */
static int
filt_porttouch(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_port_t port = kn->kn_ipc_port;
	int result = 0;

	filt_machporttouch(kn, kev);
	if (kev->flags & EV_ERROR) {
		return 0;
	}

	ip_mq_lock(port);
	result = filt_port_filter_result(kn, port);
	ip_mq_unlock(port);

	return result;
}
965
966 static int
filt_machportprocess(struct knote * kn,struct kevent_qos_s * kev,ipc_object_t object,ipc_object_type_t otype)967 filt_machportprocess(
968 struct knote *kn,
969 struct kevent_qos_s *kev,
970 ipc_object_t object,
971 ipc_object_type_t otype)
972 {
973 thread_t self = current_thread();
974 kevent_ctx_t kectx = NULL;
975
976 wait_result_t wresult;
977 mach_msg_option64_t option64;
978 mach_vm_address_t msg_addr;
979 mach_msg_size_t max_msg_size;
980 mach_msg_recv_result_t msgr;
981
982 int result = FILTER_ACTIVE;
983
984 /* Capture current state */
985 knote_fill_kevent(kn, kev, MACH_PORT_NULL);
986
987 /* Clear port reference, use ext3 as size of msg aux data */
988 kev->ext[3] = 0;
989
990 /* If already deallocated/moved return one last EOF event */
991 if (kev->flags & EV_EOF) {
992 return FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
993 }
994
995 /*
996 * Only honor supported receive options. If no options are
997 * provided, just force a MACH_RCV_LARGE to detect the
998 * name of the port and sizeof the waiting message.
999 *
1000 * Extend kn_sfflags to 64 bits.
1001 *
1002 * Add MACH_RCV_TIMEOUT to never wait (in case someone concurrently
1003 * dequeued the message that made this knote active already).
1004 */
1005 option64 = kn->kn_sfflags & (MACH_RCV_MSG | MACH_RCV_LARGE |
1006 MACH_RCV_LARGE_IDENTITY | MACH_RCV_TRAILER_MASK |
1007 MACH_RCV_VOUCHER | MACH_MSG_STRICT_REPLY);
1008 option64 = ipc_current_user_policy(current_task(), option64);
1009
1010 if (option64 & MACH_RCV_MSG) {
1011 msg_addr = (mach_vm_address_t) kn->kn_ext[0];
1012 max_msg_size = (mach_msg_size_t) kn->kn_ext[1];
1013
1014 /*
1015 * Copy out the incoming message as vector, and append aux data
1016 * immediately after the message proper (if any) and report its
1017 * size on ext3.
1018 *
1019 * Note: MACH64_RCV_LINEAR_VECTOR is how the receive machinery
1020 * knows this comes from kevent (see comment in
1021 * mach_msg_receive_too_large()).
1022 */
1023 option64 |= (MACH64_MSG_VECTOR | MACH64_RCV_LINEAR_VECTOR);
1024
1025 /*
1026 * If the kevent didn't specify a buffer and length, carve a buffer
1027 * from the filter processing data according to the flags.
1028 */
1029 if (max_msg_size == 0) {
1030 kectx = kevent_get_context(self);
1031 msg_addr = (mach_vm_address_t)kectx->kec_data_out;
1032 max_msg_size = (mach_msg_size_t)kectx->kec_data_resid;
1033 option64 |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY);
1034 /* Receive vector linearly onto stack */
1035 if (kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) {
1036 option64 |= MACH64_RCV_STACK;
1037 }
1038 }
1039 } else {
1040 /* just detect the port name (if a set) and size of the first message */
1041 option64 = MACH_RCV_LARGE;
1042 msg_addr = 0;
1043 max_msg_size = 0;
1044 }
1045 option64 |= MACH_RCV_TIMEOUT; /* never wait */
1046
1047 /*
1048 * Set up to receive a message or the notification of a
1049 * too large message. But never allow this call to wait.
1050 * If the user provided aditional options, like trailer
1051 * options, pass those through here. But we don't support
1052 * scatter lists through this interface.
1053 *
1054 * Note: while in filt_machportprocess(),
1055 * the knote has a reference on `object` that we can borrow.
1056 */
1057
1058 /* Set up message proper receive params on thread */
1059 bzero(&self->ith_receive, sizeof(self->ith_receive));
1060 self->ith_recv_bufs = (mach_msg_recv_bufs_t){
1061 .recv_msg_addr = msg_addr,
1062 .recv_msg_size = max_msg_size,
1063 };
1064 self->ith_object = object;
1065 self->ith_option = option64;
1066 self->ith_knote = kn;
1067
1068 ipc_object_validate(object, otype);
1069
1070 waitq_lock(io_waitq(object));
1071 wresult = ipc_mqueue_receive_on_thread_and_unlock(io_waitq(object),
1072 MACH_MSG_TIMEOUT_NONE, THREAD_INTERRUPTIBLE, self);
1073 /* port unlocked */
1074
1075 /* If we timed out, or the process is exiting, just zero. */
1076 if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) {
1077 assert(self->turnstile != TURNSTILE_NULL);
1078 self->ith_knote = ITH_KNOTE_NULL;
1079 return 0;
1080 }
1081
1082 assert(wresult == THREAD_NOT_WAITING);
1083 assert(self->ith_state != MACH_RCV_IN_PROGRESS);
1084
1085 /*
1086 * If we weren't attempting to receive a message
1087 * directly, we need to return the port name in
1088 * the kevent structure.
1089 */
1090 if ((option64 & MACH_RCV_MSG) != MACH_RCV_MSG) {
1091 assert(self->ith_state == MACH_RCV_TOO_LARGE);
1092 assert(self->ith_kmsg == IKM_NULL);
1093 kev->data = self->ith_receiver_name;
1094 self->ith_knote = ITH_KNOTE_NULL;
1095 return result;
1096 }
1097
1098 #if CONFIG_PREADOPT_TG
1099 /* If we're the first EVFILT_MACHPORT knote that is being processed for this
1100 * kqwl, then make sure to preadopt the thread group from the kmsg we're
1101 * about to receive. This is to make sure that we fix up the preadoption
1102 * thread group correctly on the receive side for the first message.
1103 */
1104 struct kqueue *kq = knote_get_kq(kn);
1105
1106 if (self->ith_kmsg) {
1107 struct thread_group *tg = ipc_kmsg_get_thread_group(self->ith_kmsg);
1108
1109 kqueue_process_preadopt_thread_group(self, kq, tg);
1110 }
1111 #endif
1112 if (otype == IOT_PORT) {
1113 ipc_port_t port = ip_object_to_port(object);
1114 struct kqueue *kqwl = knote_get_kq(kn);
1115 if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
1116 /*
1117 * Lock the port to make sure port->ip_kernel_iotier_override does
1118 * not change while updating the kqueue override, else kqueue could
1119 * have old iotier value.
1120 */
1121 ip_mq_lock(port);
1122 kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
1123 ip_mq_unlock(port);
1124 result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
1125 }
1126 }
1127
1128 /*
1129 * Attempt to receive the message directly, returning
1130 * the results in the fflags field.
1131 */
1132 io_reference(object);
1133 kev->fflags = mach_msg_receive_results(&msgr);
1134
1135 /* kmsg and object reference consumed */
1136
1137 /*
1138 * if the user asked for the identity of ports containing a
1139 * a too-large message, return it in the data field (as we
1140 * do for messages we didn't try to receive).
1141 */
1142 kev->ext[1] = msgr.msgr_msg_size + msgr.msgr_trailer_size;
1143 kev->ext[3] = msgr.msgr_aux_size; /* Only lower 32 bits of ext3 are used */
1144 if (kev->fflags == MACH_RCV_TOO_LARGE &&
1145 (option64 & MACH_RCV_LARGE_IDENTITY)) {
1146 kev->data = msgr.msgr_recv_name;
1147 } else {
1148 kev->data = MACH_PORT_NULL;
1149 }
1150
1151 /*
1152 * If we used a data buffer carved out from the filt_process data,
1153 * store the address used in the knote and adjust the residual and
1154 * other parameters for future use.
1155 */
1156 if (kectx && kev->fflags != MACH_RCV_TOO_LARGE) {
1157 mach_vm_size_t size = msgr.msgr_msg_size +
1158 msgr.msgr_trailer_size + msgr.msgr_aux_size;
1159
1160 assert(kectx->kec_data_resid >= size);
1161 kectx->kec_data_resid -= size;
1162 if ((kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) == 0) {
1163 kev->ext[0] = kectx->kec_data_out;
1164 kectx->kec_data_out += size;
1165 } else {
1166 assert(option64 & MACH64_RCV_STACK);
1167 kev->ext[0] = kectx->kec_data_out + kectx->kec_data_resid;
1168 }
1169 }
1170
1171 /*
1172 * Apply message-based QoS values to output kevent as prescribed.
1173 * The kev->ext[2] field gets (msg-qos << 32) | (override-qos).
1174 */
1175 if (kev->fflags == MACH_MSG_SUCCESS) {
1176 kev->ext[2] = ((uint64_t)msgr.msgr_priority << 32) |
1177 _pthread_priority_make_from_thread_qos(msgr.msgr_qos_ovrd, 0, 0);
1178 }
1179
1180 self->ith_knote = ITH_KNOTE_NULL;
1181 return result;
1182 }
1183
1184 static int
filt_psetprocess(struct knote * kn,struct kevent_qos_s * kev)1185 filt_psetprocess(struct knote *kn, struct kevent_qos_s *kev)
1186 {
1187 ipc_object_t io = ips_to_object(kn->kn_ipc_pset);
1188
1189 return filt_machportprocess(kn, kev, io, IOT_PORT_SET);
1190 }
1191
1192 static int
filt_portprocess(struct knote * kn,struct kevent_qos_s * kev)1193 filt_portprocess(struct knote *kn, struct kevent_qos_s *kev)
1194 {
1195 ipc_object_t io = ip_to_object(kn->kn_ipc_port);
1196
1197 return filt_machportprocess(kn, kev, io, IOT_PORT);
1198 }
1199
1200 static void
filt_machportsanitizedcopyout(struct knote * kn,struct kevent_qos_s * kev)1201 filt_machportsanitizedcopyout(struct knote *kn, struct kevent_qos_s *kev)
1202 {
1203 *kev = *(struct kevent_qos_s *)&kn->kn_kevent;
1204
1205 // We may have stashed the address to the port that is pushing on the sync
1206 // IPC so clear it out.
1207 kev->ext[3] = 0;
1208 }
1209
/*
 * Filter ops used at attach time only: it provides f_attach and the
 * sanitized copyout, but no detach/event/touch/process hooks —
 * presumably attach re-points the knote at mach_port_filtops or
 * mach_port_set_filtops once the object kind is known (TODO confirm
 * against filt_machportattach, which is outside this chunk).
 */
const struct filterops machport_attach_filtops = {
	.f_adjusts_qos = true,
	.f_extended_codes = true,
	.f_attach = filt_machportattach,
	.f_sanitized_copyout = filt_machportsanitizedcopyout,
};
1216
/*
 * Filter ops for EVFILT_MACHPORT knotes attached to a single port.
 * f_adjusts_qos: the filter can raise/lower the event QoS from the
 * received message; f_extended_codes: f_process returns FILTER_* codes.
 * f_sanitized_copyout scrubs ext[3] so no kernel pointer reaches user.
 */
const struct filterops mach_port_filtops = {
	.f_adjusts_qos = true,
	.f_extended_codes = true,
	.f_detach = filt_portdetach,
	.f_event = filt_portevent,
	.f_touch = filt_porttouch,
	.f_process = filt_portprocess,
	.f_sanitized_copyout = filt_machportsanitizedcopyout,
};
1226
/*
 * Filter ops for EVFILT_MACHPORT knotes attached to a port set.
 * Mirrors mach_port_filtops but routes through the pset-specific
 * detach/event/touch/process hooks; copyout sanitization is shared.
 */
const struct filterops mach_port_set_filtops = {
	.f_adjusts_qos = true,
	.f_extended_codes = true,
	.f_detach = filt_psetdetach,
	.f_event = filt_psetevent,
	.f_touch = filt_psettouch,
	.f_process = filt_psetprocess,
	.f_sanitized_copyout = filt_machportsanitizedcopyout,
};
1236