1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: ipc/ipc_pset.c
60 * Author: Rich Draves
61 * Date: 1989
62 *
63 * Functions to manipulate IPC port sets.
64 */
65
66 #include <mach/port.h>
67 #include <mach/kern_return.h>
68 #include <mach/message.h>
69 #include <ipc/ipc_mqueue.h>
70 #include <ipc/ipc_object.h>
71 #include <ipc/ipc_pset.h>
72 #include <ipc/ipc_right.h>
73 #include <ipc/ipc_space.h>
74 #include <ipc/ipc_port.h>
75 #include <ipc/ipc_kmsg.h>
76
77 #include <kern/kern_types.h>
78
79 #include <vm/vm_map.h>
80 #include <libkern/section_keywords.h>
81 #include <pthread/priority_private.h>
82
83 /* processor_set stole ipc_pset_init */
84 static void
ipc_port_set_init(ipc_pset_t pset,int policy)85 ipc_port_set_init(ipc_pset_t pset, int policy)
86 {
87 policy |= SYNC_POLICY_FIFO | SYNC_POLICY_PORT_SET;
88 waitq_set_init(&pset->ips_wqset, policy);
89 klist_init(&pset->ips_klist);
90 }
91
92 /*
93 * Routine: ipc_pset_alloc
94 * Purpose:
95 * Allocate a port set.
96 * Conditions:
97 * Nothing locked. If successful, the port set is returned
98 * locked. (The caller doesn't have a reference.)
99 * Returns:
100 * KERN_SUCCESS The port set is allocated.
101 * KERN_INVALID_TASK The space is dead.
102 * KERN_NO_SPACE No room for an entry in the space.
103 */
104
105 kern_return_t
ipc_pset_alloc(ipc_space_t space,mach_port_name_t * namep,ipc_pset_t * psetp)106 ipc_pset_alloc(
107 ipc_space_t space,
108 mach_port_name_t *namep,
109 ipc_pset_t *psetp)
110 {
111 ipc_pset_t pset;
112 mach_port_name_t name;
113 kern_return_t kr;
114
115 kr = ipc_object_alloc(space, IOT_PORT_SET,
116 MACH_PORT_TYPE_PORT_SET, 0,
117 &name, (ipc_object_t *) &pset);
118 if (kr != KERN_SUCCESS) {
119 return kr;
120 }
121 /* space is locked */
122
123 ipc_port_set_init(pset, SYNC_POLICY_INIT_LOCKED);
124 /* port set is locked */
125
126 is_write_unlock(space);
127
128 *namep = name;
129 *psetp = pset;
130 return KERN_SUCCESS;
131 }
132
133 /*
134 * Routine: ipc_pset_alloc_name
135 * Purpose:
136 * Allocate a port set, with a specific name.
137 * Conditions:
138 * Nothing locked. If successful, the port set is returned
139 * locked. (The caller doesn't have a reference.)
140 * Returns:
141 * KERN_SUCCESS The port set is allocated.
142 * KERN_INVALID_TASK The space is dead.
143 * KERN_NAME_EXISTS The name already denotes a right.
144 */
145
146 kern_return_t
ipc_pset_alloc_name(ipc_space_t space,mach_port_name_t name,ipc_pset_t * psetp)147 ipc_pset_alloc_name(
148 ipc_space_t space,
149 mach_port_name_t name,
150 ipc_pset_t *psetp)
151 {
152 return ipc_object_alloc_name(space, IOT_PORT_SET,
153 MACH_PORT_TYPE_PORT_SET, 0,
154 name, (ipc_object_t *)psetp, ^(ipc_object_t object){
155 ipc_port_set_init(ips_object_to_pset(object),
156 SYNC_POLICY_INIT_LOCKED);
157 });
158 }
159
160
161 /*
162 * Routine: ipc_pset_alloc_special
163 * Purpose:
164 * Allocate a port set in a special space.
165 * The new port set is returned with one ref.
166 * If unsuccessful, IPS_NULL is returned.
167 * Conditions:
168 * Nothing locked.
169 */
170 ipc_pset_t
ipc_pset_alloc_special(__assert_only ipc_space_t space)171 ipc_pset_alloc_special(
172 __assert_only ipc_space_t space)
173 {
174 ipc_pset_t pset;
175
176 assert(space != IS_NULL);
177 assert(!is_active(space));
178
179 pset = ips_object_to_pset(io_alloc(IOT_PORT_SET, Z_WAITOK | Z_ZERO));
180 if (pset == IPS_NULL) {
181 return IPS_NULL;
182 }
183
184 os_atomic_init(&pset->ips_object.io_bits, io_makebits(TRUE, IOT_PORT_SET, 0));
185 os_atomic_init(&pset->ips_object.io_references, 1);
186
187 ipc_port_set_init(pset, 0);
188
189 return pset;
190 }
191
192
193 /*
194 * Routine: ipc_pset_add_unlock
195 * Purpose:
196 * Puts a port into a port set.
197 * Conditions:
198 * Port locked and active.
199 * Port is unlocked on return.
200 * The owner of the port set is also receiver for the port.
201 */
202
203 kern_return_t
ipc_pset_add_unlock(ipc_pset_t pset,ipc_port_t port,waitq_ref_t * reserved_link,uint64_t * reserved_prepost)204 ipc_pset_add_unlock(
205 ipc_pset_t pset,
206 ipc_port_t port,
207 waitq_ref_t *reserved_link,
208 uint64_t *reserved_prepost)
209 {
210 require_ip_active(port);
211
212 return ipc_mqueue_add_unlock(&port->ip_messages, pset,
213 reserved_link, reserved_prepost, FALSE);
214 }
215
216
217
218 /*
219 * Routine: ipc_pset_remove_locked
220 * Purpose:
221 * Removes a port from a port set.
222 * The port set loses a reference.
223 * Conditions:
224 * Port is locked and active.
225 */
226
227 kern_return_t
ipc_pset_remove_locked(ipc_pset_t pset,ipc_port_t port)228 ipc_pset_remove_locked(
229 ipc_pset_t pset,
230 ipc_port_t port)
231 {
232 require_ip_active(port);
233
234 if (ip_in_pset(port)) {
235 return waitq_unlink_locked(&port->ip_waitq, &pset->ips_wqset);
236 }
237 return KERN_NOT_IN_SET;
238 }
239
240 /*
241 * Routine: ipc_pset_lazy_allocate
242 * Purpose:
243 * lazily initialize the wqset of a port set.
244 * Conditions:
245 * Nothing locked.
246 */
247
248 kern_return_t
ipc_pset_lazy_allocate(ipc_space_t space,mach_port_name_t psname)249 ipc_pset_lazy_allocate(
250 ipc_space_t space,
251 mach_port_name_t psname)
252 {
253 ipc_entry_bits_t bits;
254 kern_return_t kr;
255 ipc_object_t psobj;
256
257 kr = ipc_right_lookup_read(space, psname, &bits, &psobj);
258 if (kr != KERN_SUCCESS) {
259 return kr;
260 }
261 /* object is locked and active */
262
263 if ((bits & MACH_PORT_TYPE_PORT_SET) == 0) {
264 io_unlock(psobj);
265 return KERN_INVALID_RIGHT;
266 }
267
268 io_reference(psobj);
269 io_unlock(psobj);
270
271 /*
272 * lazily initialize the wqset to avoid
273 * possible allocation while linking
274 * under spinlocks.
275 */
276 waitq_set_lazy_init_link(&ips_object_to_pset(psobj)->ips_wqset);
277
278 io_release(psobj);
279
280 return KERN_SUCCESS;
281 }
282
283 /*
284 * Routine: ipc_pset_remove_from_all_unlock
285 * Purpose:
 *	Removes a port from all its port sets.
287 * Conditions:
288 * port is locked and active, port unlocked on return.
289 */
290
291 void
ipc_pset_remove_from_all_unlock(ipc_port_t port)292 ipc_pset_remove_from_all_unlock(
293 ipc_port_t port)
294 {
295 assert(waitq_is_valid(&port->ip_waitq));
296 waitq_unlink_all_unlock(&port->ip_waitq);
297 }
298
299 /*
300 * Routine: ipc_pset_move_unlock
301 * Purpose:
302 * Removes a port from all its port sets and adds it to given port set.
303 * Conditions:
304 * port is locked and active.
305 * port is unlocked on return.
306 */
307 kern_return_t
ipc_pset_move_unlock(ipc_pset_t pset,ipc_port_t port,uint64_t * reserved_prepost)308 ipc_pset_move_unlock(
309 ipc_pset_t pset,
310 ipc_port_t port,
311 uint64_t *reserved_prepost)
312 {
313 return ipc_mqueue_add_unlock(&port->ip_messages, pset,
314 NULL, reserved_prepost, TRUE);
315 }
316
317 /*
318 * Routine: ipc_pset_destroy
319 * Purpose:
320 * Destroys a port_set.
321 * Conditions:
322 * The port_set is locked and alive.
323 * The caller has a reference, which is consumed.
324 * Afterwards, the port_set is unlocked and dead.
325 */
326
void
ipc_pset_destroy(
	ipc_space_t             space,
	ipc_pset_t              pset)
{
	assert(ips_active(pset));

	/* mark the set dead first so no new lookups/links can succeed */
	io_bits_andnot(ips_to_object(pset), IO_BITS_ACTIVE);

	/*
	 * Set all waiters on the portset running to
	 * discover the change.
	 *
	 * Then under the same lock hold, deinit the waitq-set,
	 * which will remove all the member message queues,
	 * linkages and clean up preposts.
	 */
	ipc_mqueue_changed(space, &pset->ips_wqset.wqset_q);
	waitq_set_deinit_and_unlock(&pset->ips_wqset);
	/* pset is now unlocked */

	ips_release(pset);      /* consume the ref our caller gave us */
}
349
350 /*
351 * Kqueue EVFILT_MACHPORT support
352 *
353 * - kn_ipc_obj points to the monitored ipc port or pset
354 *
355 * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer
356 * that can be used to direct-deliver messages when
357 * MACH_RCV_MSG is set in kn_sfflags
358 *
359 * - (in/out) ext[1] holds a mach_msg_size_t representing the size
360 * of the userspace buffer held in ext[0].
361 *
362 * - (out) ext[2] is used to deliver qos information
363 * about the send queue to userspace.
364 *
 * - (abused)    ext[3] is used in kernel to hold a reference to the first port
 *               with a turnstile that participates in the sync IPC override.
 *
 * - kn_hook     is optionally a "knote" turnstile. It is used as the inheritor
 *               of turnstiles for rights copied out as part of direct message delivery
 *               when they can participate in the sync IPC override.
371 *
372 * It is used to atomically neuter the sync IPC override when the knote is
373 * re-enabled.
374 *
375 */
376
377 #include <sys/event.h>
378 #include <sys/errno.h>
379
/*
 * Compute the kevent filter result (FILTER_ACTIVE plus QoS adjustments)
 * for a knote attached to `object`, which is either a port or a port set.
 *
 * Conditions: the object is locked by the caller.
 */
static int
filt_machport_filter_result(struct knote *kn, ipc_object_t object)
{
	struct waitq *wq = io_waitq(object);
	ipc_kmsg_t first;
	int result = 0;

	io_lock_held(object);

	/* direct-receive knotes reset the event QoS when no message fires */
	if (kn->kn_sfflags & MACH_RCV_MSG) {
		result = FILTER_RESET_EVENT_QOS;
	}

	/* a torn-down waitq can have no messages pending */
	if (!waitq_is_valid(wq)) {
		return result;
	}

	if (waitq_is_set(wq)) {
		/*
		 * Port set: active iff some preposting member port has at
		 * least one queued message.
		 */
		ipc_pset_t pset = ips_object_to_pset(object);
		int rc;

		rc = waitq_set_iterate_preposts(&pset->ips_wqset,
		    ^(struct waitq *waitq) {
			ipc_port_t port = ip_from_waitq(waitq);
			ipc_kmsg_queue_t kmsgs = &port->ip_messages.imq_messages;

			if (ipc_kmsg_queue_first(kmsgs) != IKM_NULL) {
			        /* break out of the prepost iteration */
			        return WQ_ITERATE_BREAK;
			}
			return WQ_ITERATE_CONTINUE;
		});
		if (rc == WQ_ITERATE_BREAK) {
			result = FILTER_ACTIVE;
		}
		return result;
	}

	/* single port: active iff a message is queued */
	ipc_port_t port = ip_object_to_port(object);

	first = ipc_kmsg_queue_first(&port->ip_messages.imq_messages);
	if (!first) {
		return result;
	}

	result = FILTER_ACTIVE;
	if (kn->kn_sfflags & MACH_RCV_MSG) {
		/* reflect the head message's override QoS into the event */
		result |= FILTER_ADJUST_EVENT_QOS(first->ikm_qos_override);
	}

#if CONFIG_PREADOPT_TG
	/* pre-adopt the head message's thread group on the kqueue, if any */
	struct thread_group *tg = ipc_kmsg_get_thread_group(first);
	if (tg) {
		struct kqueue *kq = knote_get_kq(kn);
		kqueue_set_preadopted_thread_group(kq, tg,
		    first->ikm_qos_override);
	}
#endif

	return result;
}
441
442 struct turnstile *
filt_ipc_kqueue_turnstile(struct knote * kn)443 filt_ipc_kqueue_turnstile(struct knote *kn)
444 {
445 assert(kn->kn_filter == EVFILT_MACHPORT || kn->kn_filter == EVFILT_WORKLOOP);
446 return kqueue_turnstile(knote_get_kq(kn));
447 }
448
449 bool
filt_machport_kqueue_has_turnstile(struct knote * kn)450 filt_machport_kqueue_has_turnstile(struct knote *kn)
451 {
452 assert(kn->kn_filter == EVFILT_MACHPORT);
453 return ((kn->kn_sfflags & MACH_RCV_MSG) || (kn->kn_sfflags & MACH_RCV_SYNC_PEEK))
454 && (kn->kn_flags & EV_DISPATCH);
455 }
456
457 /*
 * Stashes a port that participates in the sync IPC override until the knote
459 * is being re-enabled.
460 *
461 * It returns:
462 * - the turnstile to use as an inheritor for the stashed port
463 * - the kind of stash that happened as PORT_SYNC_* value among:
464 * o not stashed (no sync IPC support)
465 * o stashed in the knote (in kn_ext[3])
466 * o to be hooked to the kn_hook knote
467 */
468 struct turnstile *
filt_machport_stash_port(struct knote * kn,ipc_port_t port,int * link)469 filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
470 {
471 struct turnstile *ts = TURNSTILE_NULL;
472
473 if (kn->kn_filter == EVFILT_WORKLOOP) {
474 assert(kn->kn_ipc_obj == NULL);
475 kn->kn_ipc_obj = ip_to_object(port);
476 ip_reference(port);
477 if (link) {
478 *link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
479 }
480 ts = filt_ipc_kqueue_turnstile(kn);
481 } else if (!filt_machport_kqueue_has_turnstile(kn)) {
482 if (link) {
483 *link = PORT_SYNC_LINK_NO_LINKAGE;
484 }
485 } else if (kn->kn_ext[3] == 0) {
486 ip_reference(port);
487 kn->kn_ext[3] = (uintptr_t)port;
488 ts = filt_ipc_kqueue_turnstile(kn);
489 if (link) {
490 *link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
491 }
492 } else {
493 ts = (struct turnstile *)kn->kn_hook;
494 if (link) {
495 *link = PORT_SYNC_LINK_WORKLOOP_STASH;
496 }
497 }
498
499 return ts;
500 }
501
502 /*
503 * Lazily prepare a turnstile so that filt_machport_stash_port()
504 * can be called with the mqueue lock held.
505 *
506 * It will allocate a turnstile in kn_hook if:
507 * - the knote supports sync IPC override,
508 * - we already stashed a port in kn_ext[3],
509 * - the object that will be copied out has a chance to ask to be stashed.
510 *
511 * It is setup so that its inheritor is the workloop turnstile that has been
512 * allocated when this knote was attached.
513 */
void
filt_machport_turnstile_prepare_lazily(
	struct knote *kn,
	mach_msg_type_name_t msgt_name,
	ipc_port_t port)
{
	/* This is called from within filt_machportprocess */
	assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED));

	/* no sync IPC override support on this knote: nothing to prepare */
	if (!filt_machport_kqueue_has_turnstile(kn)) {
		return;
	}

	/* need a previously stashed port, and no knote turnstile yet */
	if (kn->kn_ext[3] == 0 || kn->kn_hook) {
		return;
	}

	struct turnstile *ts = filt_ipc_kqueue_turnstile(kn);
	/*
	 * Only rights that may ask to be stashed need the turnstile:
	 * send-once rights to special reply ports, and receive rights.
	 */
	if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) ||
	    (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) {
		struct turnstile *kn_ts = turnstile_alloc();
		kn_ts = turnstile_prepare((uintptr_t)kn,
		    (struct turnstile **)&kn->kn_hook, kn_ts, TURNSTILE_KNOTE);
		/* chain the knote turnstile onto the workloop turnstile */
		turnstile_update_inheritor(kn_ts, ts,
		    TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
		turnstile_cleanup();
	}
}
542
/*
 * Sever the sync IPC linkage between a knote and a previously stashed
 * port, then release the reference the stash held on the port.
 */
static void
filt_machport_turnstile_complete_port(struct knote *kn, ipc_port_t port)
{
	struct turnstile *ts = TURNSTILE_NULL;

	ip_mq_lock(port);
	if (port->ip_specialreply) {
		/*
		 * If the reply has been sent to the special reply port already,
		 * then the special reply port may already be reused to do something
		 * entirely different.
		 *
		 * However, the only reason for it to still point to this knote is
		 * that it's still waiting for a reply, so when this is the case,
		 * neuter the linkage.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_sync_inheritor_knote == kn) {
			ipc_port_adjust_special_reply_port_locked(port, NULL,
			    (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
			/* port unlocked */
		} else {
			ip_mq_unlock(port);
		}
	} else {
		/*
		 * For receive rights, if their IMQ_KNOTE() is still this
		 * knote, then sever the link.
		 */
		if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
		    port->ip_messages.imq_inheritor_knote == kn) {
			ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
			ts = port_send_turnstile(port);
		}
		/* take a ref and clear the inheritor while still locked */
		if (ts) {
			turnstile_reference(ts);
			turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
			    TURNSTILE_IMMEDIATE_UPDATE);
		}
		ip_mq_unlock(port);

		/* finish the inheritor update outside the port lock */
		if (ts) {
			turnstile_update_inheritor_complete(ts,
			    TURNSTILE_INTERLOCK_NOT_HELD);
			turnstile_deallocate(ts);
		}
	}

	/* drop the reference the stash was holding */
	ip_release(port);
}
593
594 void
filt_wldetach_sync_ipc(struct knote * kn)595 filt_wldetach_sync_ipc(struct knote *kn)
596 {
597 ipc_object_t io = kn->kn_ipc_obj;
598 filt_machport_turnstile_complete_port(kn, ip_object_to_port(io));
599 kn->kn_ipc_obj = IO_NULL;
600 }
601
602 /*
603 * Other half of filt_machport_turnstile_prepare_lazily()
604 *
605 * This is serialized by the knote state machine.
606 */
static void
filt_machport_turnstile_complete(struct knote *kn)
{
	/* release the stashed port (kn_ext[3]) and sever its linkage */
	if (kn->kn_ext[3]) {
		ipc_port_t port = (ipc_port_t)kn->kn_ext[3];
		filt_machport_turnstile_complete_port(kn, port);
		kn->kn_ext[3] = 0;
	}

	/* tear down the knote turnstile, if one was lazily allocated */
	if (kn->kn_hook) {
		struct turnstile *ts = kn->kn_hook;

		/* clear its inheritor before completing it */
		turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
		    TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);

		turnstile_complete((uintptr_t)kn, (struct turnstile **)&kn->kn_hook, &ts, TURNSTILE_KNOTE);
		turnstile_cleanup();

		assert(ts);
		turnstile_deallocate(ts);
	}
}
630
631 static void
filt_machport_link(struct klist * klist,struct knote * kn)632 filt_machport_link(struct klist *klist, struct knote *kn)
633 {
634 struct knote *hd = SLIST_FIRST(klist);
635
636 if (hd && filt_machport_kqueue_has_turnstile(kn)) {
637 SLIST_INSERT_AFTER(hd, kn, kn_selnext);
638 } else {
639 SLIST_INSERT_HEAD(klist, kn, kn_selnext);
640 }
641 }
642
/*
 * Detach a knote from a port/pset klist, then restore the invariant
 * that the head of the list (if any suitable knote exists) is one
 * that supports sync IPC push — the head is the knote the turnstile
 * inheritance code looks at (see filt_machportdetach).
 */
static void
filt_machport_unlink(struct klist *klist, struct knote *kn)
{
	struct knote **knprev;

	KNOTE_DETACH(klist, kn);

	/* make sure the first knote is a knote we can push on */
	SLIST_FOREACH_PREVPTR(kn, knprev, klist, kn_selnext) {
		if (filt_machport_kqueue_has_turnstile(kn)) {
			/* move the pushable knote to the head of the list */
			*knprev = SLIST_NEXT(kn, kn_selnext);
			SLIST_INSERT_HEAD(klist, kn, kn_selnext);
			break;
		}
	}
}
659
/*
 * EVFILT_WORKLOOP attach of a sync IPC right: kn->kn_id names a right
 * in the current space.  Accepts a receive right on a regular port, or
 * a send-once right on a special reply port, and stashes the port on
 * the knote via the ipc_port_adjust_* helpers.
 *
 * Returns 0 on success, an errno otherwise.
 */
int
filt_wlattach_sync_ipc(struct knote *kn)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	ipc_port_t port = IP_NULL;
	int error = 0;

	if (ipc_right_lookup_read(space, name, &bits, &object) != KERN_SUCCESS) {
		return ENOENT;
	}
	/* object is locked and active */

	if (bits & MACH_PORT_TYPE_RECEIVE) {
		/* receive rights qualify, except on special reply ports */
		port = ip_object_to_port(object);
		if (port->ip_specialreply) {
			error = ENOENT;
		}
	} else if (bits & MACH_PORT_TYPE_SEND_ONCE) {
		/* send-once rights qualify only on special reply ports */
		port = ip_object_to_port(object);
		if (!port->ip_specialreply) {
			error = ENOENT;
		}
	} else {
		error = ENOENT;
	}
	if (error) {
		io_unlock(object);
		return error;
	}

	if (port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
		io_unlock(object);
		/*
		 * We cannot start a sync IPC inheritance chain, only further one
		 * Note: this can also happen if the inheritance chain broke
		 * because the original requestor died.
		 */
		return ENOENT;
	}

	if (port->ip_specialreply) {
		ipc_port_adjust_special_reply_port_locked(port, kn,
		    IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE);
	} else {
		ipc_port_adjust_port_locked(port, kn, FALSE);
	}

	/* make sure the port was stashed */
	assert(kn->kn_ipc_obj == ip_to_object(port));

	/* port has been unlocked by ipc_port_adjust_* */

	return 0;
}
717
/*
 * EVFILT_MACHPORT attach: kn->kn_id names a right in the current space.
 * Only port sets and (non special-reply) receive rights are supported.
 * On success the knote holds a reference on the ipc object in kn_ipc_obj.
 */
static int
filt_machportattach(
	struct knote *kn,
	__unused struct kevent_qos_s *kev)
{
	mach_port_name_t name = (mach_port_name_t)kn->kn_id;
	ipc_space_t space = current_space();
	ipc_entry_bits_t bits;
	ipc_object_t object;
	struct turnstile *send_turnstile = TURNSTILE_NULL;

	int error = 0;
	int result = 0;
	kern_return_t kr;

	kn->kn_flags &= ~EV_EOF;
	kn->kn_ext[3] = 0;              /* no port stashed yet */

	if (filt_machport_kqueue_has_turnstile(kn)) {
		/*
		 * If the filter is likely to support sync IPC override,
		 * and it happens to be attaching to a workloop,
		 * make sure the workloop has an allocated turnstile.
		 */
		kqueue_alloc_turnstile(knote_get_kq(kn));
	}

	kr = ipc_right_lookup_read(space, name, &bits, &object);

	if (kr != KERN_SUCCESS) {
		error = ENOENT;
		goto out;
	}
	/* object is locked and active */

	if (bits & MACH_PORT_TYPE_PORT_SET) {
		ipc_pset_t pset = ips_object_to_pset(object);

		io_reference(object);
		kn->kn_ipc_obj = object;
		filt_machport_link(&pset->ips_klist, kn);
		result = filt_machport_filter_result(kn, object);
		io_unlock(object);
	} else if (bits & MACH_PORT_TYPE_RECEIVE) {
		ipc_port_t port = ip_object_to_port(object);

		if (port->ip_specialreply) {
			/*
			 * Registering for kevents on special reply ports
			 * isn't supported for two reasons:
			 *
			 * 1. it really makes very little sense for a port that
			 *    is supposed to be used synchronously
			 *
			 * 2. their ports's ip_klist field will be used to
			 *    store the receive turnstile, so we can't possibly
			 *    attach them anyway.
			 */
			io_unlock(object);
			error = ENOTSUP;
			goto out;
		}

		io_reference(object);
		kn->kn_ipc_obj = object;
		if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) {
			/*
			 * We're attaching a port that used to have an IMQ_KNOTE,
			 * clobber this state, we'll fixup its turnstile inheritor below.
			 */
			ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
		}

		filt_machport_link(&port->ip_klist, kn);
		result = filt_machport_filter_result(kn, object);

		/*
		 * Update the port's turnstile inheritor
		 *
		 * Unlike filt_machportdetach(), we don't have to care about races for
		 * turnstile_workloop_pusher_info(): filt_machport_link() doesn't affect
		 * already pushing knotes, and if the current one becomes the new
		 * pusher, it'll only be visible when turnstile_workloop_pusher_info()
		 * returns.
		 */
		send_turnstile = port_send_turnstile(port);
		if (send_turnstile) {
			turnstile_reference(send_turnstile);
			ipc_port_send_update_inheritor(port, send_turnstile,
			    TURNSTILE_IMMEDIATE_UPDATE);

			/*
			 * rdar://problem/48861190
			 *
			 * When a listener connection resumes a peer,
			 * updating the inheritor above has moved the push
			 * from the current thread to the workloop.
			 *
			 * However, we haven't told the workloop yet
			 * that it needs a thread request, and we risk
			 * to be preempted as soon as we drop the space
			 * lock below.
			 *
			 * To avoid this disable preemption and let kevent
			 * reenable it after it takes the kqlock.
			 */
			disable_preemption();
			result |= FILTER_THREADREQ_NODEFEER;
		}

		io_unlock(object);

		/* finish the inheritor update outside the object lock */
		if (send_turnstile) {
			turnstile_update_inheritor_complete(send_turnstile,
			    TURNSTILE_INTERLOCK_NOT_HELD);
			turnstile_deallocate_safe(send_turnstile);
		}
	} else {
		/* other right types (send, send-once, ...) are unsupported */
		io_unlock(object);
		error = ENOTSUP;
	}

out:
	/* bail out on errors */
	if (error) {
		knote_set_error(kn, error);
		return 0;
	}

	return result;
}
849
/*
 * EVFILT_MACHPORT detach: undoes filt_machportattach(), dropping the
 * sync IPC override state and the knote's reference on the ipc object.
 */
static void
filt_machportdetach(
	struct knote *kn)
{
	ipc_object_t object = kn->kn_ipc_obj;
	struct turnstile *send_turnstile = TURNSTILE_NULL;

	/* release any stashed port / knote turnstile first */
	filt_machport_turnstile_complete(kn);

	io_lock(object);
	if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
		/*
		 * ipc_mqueue_changed() already unhooked this knote from the waitq,
		 */
	} else {
		ipc_port_t port = IP_NULL;

		/*
		 * When the knote being detached is the first one in the list,
		 * then unlinking the knote *and* updating the turnstile inheritor
		 * need to happen atomically with respect to the callers of
		 * turnstile_workloop_pusher_info().
		 *
		 * The caller of turnstile_workloop_pusher_info() will use the kq req
		 * lock (and hence the kqlock), so we just need to hold the kqlock too.
		 */
		if (io_otype(object) == IOT_PORT) {
			port = ip_object_to_port(object);
			assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY);
			if (kn == SLIST_FIRST(&port->ip_klist)) {
				send_turnstile = port_send_turnstile(port);
			}
			filt_machport_unlink(&port->ip_klist, kn);
		} else {
			ipc_pset_t pset = ips_object_to_pset(object);

			filt_machport_unlink(&pset->ips_klist, kn);
		}


		if (send_turnstile) {
			turnstile_reference(send_turnstile);
			ipc_port_send_update_inheritor(port, send_turnstile,
			    TURNSTILE_IMMEDIATE_UPDATE);
		}
	}

	/* Clear the knote pointer once the knote has been removed from turnstile */
	kn->kn_ipc_obj = IO_NULL;
	io_unlock(object);

	/* finish the inheritor update outside the object lock */
	if (send_turnstile) {
		turnstile_update_inheritor_complete(send_turnstile,
		    TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate(send_turnstile);
	}

	/* drop the reference taken at attach time */
	io_release(object);
}
909
910 /*
911 * filt_machportevent - deliver events into the mach port filter
912 *
913 * Mach port message arrival events are currently only posted via the
914 * kqueue filter routine for ports.
915 *
916 * If there is a message at the head of the queue,
917 * we indicate that the knote should go active. If
918 * the message is to be direct-received, we adjust the
919 * QoS of the knote according the requested and override
920 * QoS of that first message.
921 *
922 * When the knote is for a port-set, the hint is non 0
923 * and is the waitq which is posting.
924 */
925 static int
filt_machportevent(struct knote * kn,long hint __assert_only)926 filt_machportevent(struct knote *kn, long hint __assert_only)
927 {
928 if (io_otype(kn->kn_ipc_obj) == IOT_PORT_SET) {
929 /*
930 * When called for a port-set,
931 * the posting port waitq is locked.
932 *
933 * waitq_set_iterate_preposts()
934 * in filt_machport_filter_result()
935 * would try to lock it and be very sad.
936 *
937 * Just trust what we know to be true.
938 */
939 assert(hint != 0);
940 return FILTER_ACTIVE;
941 }
942 assert(hint == 0);
943 return filt_machport_filter_result(kn, kn->kn_ipc_obj);
944 }
945
946 /*
947 * Upcall from the waitq code to prepost to the kevent subsystem.
948 *
949 * Called with the pset and waitq locks held.
950 */
951 void
ipc_pset_prepost(struct waitq_set * wqs,struct waitq * waitq)952 ipc_pset_prepost(struct waitq_set *wqs, struct waitq *waitq)
953 {
954 KNOTE(&ips_from_waitq(&wqs->wqset_q)->ips_klist, (long)waitq);
955 }
956
957 static int
filt_machporttouch(struct knote * kn,struct kevent_qos_s * kev)958 filt_machporttouch(
959 struct knote *kn,
960 struct kevent_qos_s *kev)
961 {
962 ipc_object_t object = kn->kn_ipc_obj;
963 int result = 0;
964
965 /* copy in new settings and save off new input fflags */
966 kn->kn_sfflags = kev->fflags;
967 kn->kn_ext[0] = kev->ext[0];
968 kn->kn_ext[1] = kev->ext[1];
969
970 if (kev->flags & EV_ENABLE) {
971 /*
972 * If the knote is being enabled, make sure there's no lingering
973 * IPC overrides from the previous message delivery.
974 */
975 filt_machport_turnstile_complete(kn);
976 }
977
978 io_lock(object);
979 result = filt_machport_filter_result(kn, object);
980 io_unlock(object);
981
982 return result;
983 }
984
/*
 * EVFILT_MACHPORT process: fill in the output kevent and, when
 * MACH_RCV_MSG was requested, attempt a non-blocking direct receive
 * of the pending message into the user (or filter-carved) buffer.
 */
static int
filt_machportprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	ipc_object_t object = kn->kn_ipc_obj;
	thread_t self = current_thread();
	kevent_ctx_t kectx = NULL;

	wait_result_t wresult;
	mach_msg_option_t option;
	mach_vm_address_t addr;
	mach_msg_size_t size;

	/* Capture current state */
	knote_fill_kevent(kn, kev, MACH_PORT_NULL);
	kev->ext[3] = 0; /* hide our port reference from userspace */

	/* If already deallocated/moved return one last EOF event */
	if (kev->flags & EV_EOF) {
		return FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
	}

	/*
	 * Only honor supported receive options. If no options are
	 * provided, just force a MACH_RCV_TOO_LARGE to detect the
	 * name of the port and sizeof the waiting message.
	 */
	option = kn->kn_sfflags & (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
	    MACH_RCV_TRAILER_MASK | MACH_RCV_VOUCHER | MACH_MSG_STRICT_REPLY);

	if (option & MACH_RCV_MSG) {
		addr = (mach_vm_address_t) kn->kn_ext[0];
		size = (mach_msg_size_t) kn->kn_ext[1];

		/*
		 * If the kevent didn't specify a buffer and length, carve a buffer
		 * from the filter processing data according to the flags.
		 */
		if (size == 0) {
			kectx = kevent_get_context(self);
			addr = (mach_vm_address_t)kectx->kec_data_out;
			size = (mach_msg_size_t)kectx->kec_data_resid;
			option |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY);
			if (kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) {
				option |= MACH_RCV_STACK;
			}
		}
	} else {
		/* just detect the port name (if a set) and size of the first message */
		option = MACH_RCV_LARGE;
		addr = 0;
		size = 0;
	}

	/*
	 * Set up to receive a message or the notification of a
	 * too large message. But never allow this call to wait.
	 * If the user provided additional options, like trailer
	 * options, pass those through here. But we don't support
	 * scatter lists through this interface.
	 *
	 * Note: while in filt_machportprocess(),
	 *       the knote has a reference on `object` that we can borrow.
	 */
	self->ith_object = object;
	self->ith_msg_addr = addr;
	self->ith_rsize = size;
	self->ith_msize = 0;
	self->ith_option = option;
	self->ith_receiver_name = MACH_PORT_NULL;
	self->ith_continuation = NULL;
	option |= MACH_RCV_TIMEOUT; // never wait
	self->ith_state = MACH_RCV_IN_PROGRESS;
	self->ith_knote = kn;

	wresult = ipc_mqueue_receive_on_thread(
		io_waitq(object),
		option,
		size, /* max_size */
		0,    /* immediate timeout */
		THREAD_INTERRUPTIBLE,
		self);
	/* port unlocked */

	/* If we timed out, or the process is exiting, just zero. */
	if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) {
		assert(self->turnstile != TURNSTILE_NULL);
		return 0;
	}

	assert(wresult == THREAD_NOT_WAITING);
	assert(self->ith_state != MACH_RCV_IN_PROGRESS);

	/*
	 * If we weren't attempting to receive a message
	 * directly, we need to return the port name in
	 * the kevent structure.
	 */
	if ((option & MACH_RCV_MSG) != MACH_RCV_MSG) {
		assert(self->ith_state == MACH_RCV_TOO_LARGE);
		assert(self->ith_kmsg == IKM_NULL);
		kev->data = self->ith_receiver_name;
		return FILTER_ACTIVE;
	}

#if CONFIG_PREADOPT_TG
	/* If we're the first EVFILT_MACHPORT knote that is being processed for this
	 * kqwl, then make sure to preadopt the thread group from the kmsg we're
	 * about to receive. This is to make sure that we fix up the preadoption
	 * thread group correctly on the receive side for the first message.
	 */
	struct kqueue *kq = knote_get_kq(kn);

	if (self->ith_kmsg) {
		struct thread_group *tg = ipc_kmsg_get_thread_group(self->ith_kmsg);

		kqueue_process_preadopt_thread_group(self, kq, tg);
	}
#endif

	/*
	 * Attempt to receive the message directly, returning
	 * the results in the fflags field.
	 */
	io_reference(object);
	kev->fflags = mach_msg_receive_results(&size);

	/* kmsg and object reference consumed */

	/*
	 * if the user asked for the identity of ports containing a
	 * too-large message, return it in the data field (as we
	 * do for messages we didn't try to receive).
	 */
	if (kev->fflags == MACH_RCV_TOO_LARGE) {
		kev->ext[1] = self->ith_msize;
		if (option & MACH_RCV_LARGE_IDENTITY) {
			kev->data = self->ith_receiver_name;
		} else {
			kev->data = MACH_PORT_NULL;
		}
	} else {
		kev->ext[1] = size;
		kev->data = MACH_PORT_NULL;
	}

	/*
	 * If we used a data buffer carved out from the filt_process data,
	 * store the address used in the knote and adjust the residual and
	 * other parameters for future use.
	 */
	if (kectx) {
		assert(kectx->kec_data_resid >= size);
		kectx->kec_data_resid -= size;
		if ((kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) == 0) {
			kev->ext[0] = kectx->kec_data_out;
			kectx->kec_data_out += size;
		} else {
			assert(option & MACH_RCV_STACK);
			kev->ext[0] = kectx->kec_data_out + kectx->kec_data_resid;
		}
	}

	/*
	 * Apply message-based QoS values to output kevent as prescribed.
	 * The kev->ext[2] field gets (msg-qos << 32) | (override-qos).
	 *
	 * The mach_msg_receive_results() call saved off the message
	 * QoS values in the continuation save area on successful receive.
	 */
	if (kev->fflags == MACH_MSG_SUCCESS) {
		kev->ext[2] = ((uint64_t)self->ith_ppriority << 32) |
		    _pthread_priority_make_from_thread_qos(self->ith_qos_override, 0, 0);
	}

	return FILTER_ACTIVE;
}
1161
/* EVFILT_MACHPORT filter operations vector (ports and port sets) */
SECURITY_READ_ONLY_EARLY(struct filterops) machport_filtops = {
	.f_adjusts_qos = true,
	.f_extended_codes = true,
	.f_attach = filt_machportattach,
	.f_detach = filt_machportdetach,
	.f_event = filt_machportevent,
	.f_touch = filt_machporttouch,
	.f_process = filt_machportprocess,
};
1171