1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: ipc/ipc_pset.c
60 * Author: Rich Draves
61 * Date: 1989
62 *
63 * Functions to manipulate IPC port sets.
64 */
65
66 #include <mach/port.h>
67 #include <mach/kern_return.h>
68 #include <mach/message.h>
69 #include <ipc/ipc_mqueue.h>
70 #include <ipc/ipc_object.h>
71 #include <ipc/ipc_policy.h>
72 #include <ipc/ipc_pset.h>
73 #include <ipc/ipc_right.h>
74 #include <ipc/ipc_space.h>
75 #include <ipc/ipc_port.h>
76 #include <ipc/ipc_kmsg.h>
77 #include <kern/policy_internal.h>
78
79 #include <kern/kern_types.h>
80
81 #include <vm/vm_map.h>
82 #include <libkern/section_keywords.h>
83 #include <pthread/priority_private.h>
84
85 /* processor_set stole ipc_pset_init */
86 static void
ipc_port_set_init(ipc_pset_t pset,mach_port_name_t name)87 ipc_port_set_init(ipc_pset_t pset, mach_port_name_t name)
88 {
89 waitq_init(&pset->ips_wqset, WQT_PORT_SET,
90 SYNC_POLICY_INIT_LOCKED | SYNC_POLICY_FIFO);
91 klist_init(&pset->ips_klist);
92 pset->ips_wqset.wqset_index = MACH_PORT_INDEX(name);
93
94 /* init io_bits */
95 os_ref_init_raw(&pset->ips_object.io_references, NULL);
96 io_label_init(&pset->ips_object, (ipc_object_label_t){
97 .io_type = IOT_PORT_SET,
98 .io_state = IO_STATE_IN_SPACE_IMMOVABLE,
99 });
100 }
101
102 void
ipc_pset_lock(ipc_pset_t pset)103 ipc_pset_lock(ipc_pset_t pset)
104 {
105 ips_validate(pset);
106 waitq_lock(&pset->ips_wqset);
107 }
108
109 /*
110 * Routine: ipc_pset_alloc
111 * Purpose:
112 * Allocate a port set.
113 * Conditions:
114 * Nothing locked. If successful, the port set is returned
115 * locked. (The caller doesn't have a reference.)
116 * Returns:
117 * KERN_SUCCESS The port set is allocated.
118 * KERN_INVALID_TASK The space is dead.
119 * KERN_NO_SPACE No room for an entry in the space.
120 */
121
122 kern_return_t
ipc_pset_alloc(ipc_space_t space,mach_port_name_t * namep,ipc_pset_t * psetp)123 ipc_pset_alloc(
124 ipc_space_t space,
125 mach_port_name_t *namep,
126 ipc_pset_t *psetp)
127 {
128 mach_port_name_t name;
129 kern_return_t kr;
130 ipc_entry_t entry;
131 mach_port_type_t type = MACH_PORT_TYPE_PORT_SET;
132 mach_port_urefs_t urefs = 0;
133 ipc_pset_t pset;
134 ipc_object_t object;
135
136 pset = ips_alloc();
137 object = ips_to_object(pset);
138 kr = ipc_object_alloc_entry(space, object, &name, &entry);
139 if (kr != KERN_SUCCESS) {
140 ips_free(pset);
141 return kr;
142 }
143 /* space is locked */
144
145 ipc_port_set_init(pset, name);
146 /* port set is locked */
147 ipc_entry_init(space, object, type, entry, urefs, name);
148
149 is_write_unlock(space);
150
151 *namep = name;
152 *psetp = pset;
153 return KERN_SUCCESS;
154 }
155
156 /*
157 * Routine: ipc_pset_alloc_name
158 * Purpose:
159 * Allocate a port set, with a specific name.
160 * Conditions:
161 * Nothing locked. If successful, the port set is returned
162 * locked. (The caller doesn't have a reference.)
163 * Returns:
164 * KERN_SUCCESS The port set is allocated.
165 * KERN_INVALID_TASK The space is dead.
166 * KERN_NAME_EXISTS The name already denotes a right.
167 */
168
169 kern_return_t
ipc_pset_alloc_name(ipc_space_t space,mach_port_name_t name,ipc_pset_t * psetp)170 ipc_pset_alloc_name(
171 ipc_space_t space,
172 mach_port_name_t name,
173 ipc_pset_t *psetp)
174 {
175 kern_return_t kr;
176 ipc_entry_t entry;
177 mach_port_type_t type = MACH_PORT_TYPE_PORT_SET;
178 mach_port_urefs_t urefs = 0;
179 ipc_pset_t pset;
180 ipc_object_t object;
181
182 pset = ips_alloc();
183 object = ips_to_object(pset);
184 kr = ipc_object_alloc_entry_with_name(space, name, &entry);
185 if (kr != KERN_SUCCESS) {
186 ips_free(pset);
187 return kr;
188 }
189 /* space is locked */
190
191 ipc_port_set_init(pset, name);
192 /* port set is locked */
193 ipc_entry_init(space, object, type, entry, urefs, name);
194
195 is_write_unlock(space);
196 *psetp = pset;
197 return KERN_SUCCESS;
198 }
199
200
201 /*
202 * Routine: ipc_pset_alloc_special
203 * Purpose:
204 * Allocate a port set in a special space.
205 * The new port set is returned with one ref and locked.
206 * If unsuccessful, IPS_NULL is returned.
207 * Conditions:
208 * Nothing locked.
209 */
210 ipc_pset_t
ipc_pset_alloc_special(__assert_only ipc_space_t space)211 ipc_pset_alloc_special(
212 __assert_only ipc_space_t space)
213 {
214 ipc_pset_t pset = ips_alloc();
215
216 assert(space != IS_NULL);
217 assert(!is_active(space));
218
219 ipc_port_set_init(pset, MACH_PORT_SPECIAL_DEFAULT);
220 /* port set is locked */
221 return pset;
222 }
223
224
225 /*
226 * Routine: ipc_pset_destroy
227 * Purpose:
228 * Destroys a port_set.
229 * Conditions:
230 * The port_set is locked and alive.
231 * The caller has a reference, which is consumed.
232 * Afterwards, the port_set is unlocked and dead.
233 */
234
235 void
ipc_pset_destroy(ipc_space_t space,ipc_pset_t pset)236 ipc_pset_destroy(
237 ipc_space_t space,
238 ipc_pset_t pset)
239 {
240 waitq_link_list_t free_l = { };
241 ipc_object_label_t label = io_label_get(&pset->ips_object, IOT_PORT_SET);
242
243 ipc_release_assert(io_state_in_space(label.io_state));
244 label.io_state = IO_STATE_INACTIVE;
245 io_label_set_and_put(&pset->ips_object, &label);
246
247 /*
248 * Set all waiters on the portset running to
249 * discover the change.
250 *
251 * Then under the same lock hold, deinit the waitq-set,
252 * which will remove all the member message queues,
253 * linkages and clean up preposts.
254 */
255 ipc_mqueue_changed(space, &pset->ips_wqset);
256 waitq_invalidate(&pset->ips_wqset);
257 waitq_set_unlink_all_locked(&pset->ips_wqset, &free_l);
258
259 ips_mq_unlock(pset);
260
261 ips_release(pset); /* consume the ref our caller gave us */
262
263 waitq_link_free_list(WQT_PORT_SET, &free_l);
264 }
265
266 /*
267 * Routine: ipc_pset_free
268 * Purpose:
269 * Called on last reference deallocate to
270 * free any remaining data associated with the pset.
271 * Conditions:
272 * Nothing locked.
273 */
274 void
ipc_pset_free(ipc_pset_t pset)275 ipc_pset_free(
276 ipc_pset_t pset)
277 {
278 waitq_deinit(&pset->ips_wqset);
279 ips_free(pset);
280 }
281
282
283 #pragma mark - kevent support
284
/*
 * Kqueue EVFILT_MACHPORT support
 *
 * - kn_ipc_{port,pset} points to the monitored ipc port or pset. If the knote
 *   is using a kqwl, it is eligible to participate in sync IPC overrides.
 *
 *   For the first such sync IPC message in the port, we set up the port's
 *   turnstile to directly push on the kqwl's turnstile (which is in turn set up
 *   during filt_machportattach). If userspace responds to the message, the
 *   turnstile push is severed at the point of reply. If userspace returns
 *   without responding to the message, we sever the turnstile push at the
 *   point of reenabling the knote to deliver the next message. This is why the
 *   knote needs to remember the port. For more details, see also
 *   filt_machport_turnstile_complete.
 *
 *   If there are multiple other sync IPC messages in the port, messages 2 to n
 *   redirect their turnstile push to the kqwl through an intermediary "knote"
 *   turnstile which, in turn, pushes on the kqwl turnstile. This knote turnstile
 *   is stored in the kn_hook. See also filt_machport_turnstile_prepare_lazily.
 *
 * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer
 *   that can be used to direct-deliver messages when
 *   MACH_RCV_MSG is set in kn_sfflags
 *
 * - (in/out) ext[1] holds a mach_msg_size_t representing the size
 *   of the userspace buffer held in ext[0].
 *
 * - (out) ext[2] is used to deliver qos information
 *   about the send queue to userspace.
 *
 * - (abused) ext[3] is used in kernel to hold a reference to the first port
 *   with a turnstile that participates in sync IPC override. For more details,
 *   see filt_machport_stash_port
 *
 * - kn_hook is optionally a "knote" turnstile. It is used as the inheritor
 *   of turnstiles for rights copied out as part of direct message delivery
 *   when they can participate in sync IPC override.
 *
 *   It is used to atomically neuter the sync IPC override when the knote is
 *   re-enabled.
 *
 */
327
328 #include <sys/event.h>
329 #include <sys/errno.h>
330
331 static int
filt_pset_filter_result(ipc_pset_t pset)332 filt_pset_filter_result(ipc_pset_t pset)
333 {
334 ips_mq_lock_held(pset);
335
336 if (!waitq_is_valid(&pset->ips_wqset)) {
337 return 0;
338 }
339
340 return waitq_set_first_prepost(&pset->ips_wqset, WQS_PREPOST_PEEK) ?
341 FILTER_ACTIVE : 0;
342 }
343
344 static int
filt_port_filter_result(struct knote * kn,ipc_port_t port)345 filt_port_filter_result(struct knote *kn, ipc_port_t port)
346 {
347 struct kqueue *kqwl = knote_get_kq(kn);
348 ipc_kmsg_t first;
349 int result = 0;
350
351 ip_mq_lock_held(port);
352
353 if (kn->kn_sfflags & MACH_RCV_MSG) {
354 result = FILTER_RESET_EVENT_QOS;
355 }
356
357 if (!waitq_is_valid(&port->ip_waitq)) {
358 return result;
359 }
360
361 if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
362 kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
363 result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
364 }
365
366 first = ipc_kmsg_queue_first(&port->ip_messages.imq_messages);
367 if (!first) {
368 return result;
369 }
370
371 result = FILTER_ACTIVE;
372 if (kn->kn_sfflags & MACH_RCV_MSG) {
373 result |= FILTER_ADJUST_EVENT_QOS(first->ikm_qos_override);
374 }
375
376 #if CONFIG_PREADOPT_TG
377 struct thread_group *tg = ipc_kmsg_get_thread_group(first);
378 if (tg) {
379 struct kqueue *kq = knote_get_kq(kn);
380 kqueue_set_preadopted_thread_group(kq, tg,
381 first->ikm_qos_override);
382 }
383 #endif
384
385 return result;
386 }
387
388 struct turnstile *
filt_ipc_kqueue_turnstile(struct knote * kn)389 filt_ipc_kqueue_turnstile(struct knote *kn)
390 {
391 assert(kn->kn_filter == EVFILT_MACHPORT || kn->kn_filter == EVFILT_WORKLOOP);
392 return kqueue_turnstile(knote_get_kq(kn));
393 }
394
395 bool
filt_machport_kqueue_has_turnstile(struct knote * kn)396 filt_machport_kqueue_has_turnstile(struct knote *kn)
397 {
398 assert(kn->kn_filter == EVFILT_MACHPORT);
399 return ((kn->kn_sfflags & MACH_RCV_MSG) || (kn->kn_sfflags & MACH_RCV_SYNC_PEEK))
400 && (kn->kn_flags & EV_DISPATCH);
401 }
402
403 /*
404 * Stashes a port that participate to sync IPC override on the knote until the
405 * knote is re-enabled.
406 *
407 * It returns:
408 * - the turnstile to use as an inheritor for the stashed port
409 * - the kind of stash that happened as PORT_SYNC_* value among:
410 * o not stashed (no sync IPC support)
411 * o stashed in the knote (in kn_ext[3])
412 * o to be hooked to the kn_hook knote
413 */
414 struct turnstile *
filt_machport_stash_port(struct knote * kn,ipc_port_t port,int * link)415 filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
416 {
417 struct turnstile *ts = TURNSTILE_NULL;
418
419 if (kn->kn_filter == EVFILT_WORKLOOP) {
420 assert(kn->kn_ipc_port == NULL);
421 kn->kn_ipc_port = port;
422 ip_reference(port);
423 if (link) {
424 *link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
425 }
426 ts = filt_ipc_kqueue_turnstile(kn);
427 } else if (!filt_machport_kqueue_has_turnstile(kn)) {
428 if (link) {
429 *link = PORT_SYNC_LINK_NO_LINKAGE;
430 }
431 } else if (kn->kn_ext[3] == 0) {
432 ip_reference(port);
433 kn->kn_ext[3] = (uintptr_t)port;
434 ts = filt_ipc_kqueue_turnstile(kn);
435 if (link) {
436 *link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
437 }
438 } else {
439 ts = (struct turnstile *)knote_kn_hook_get_raw(kn);
440 if (link) {
441 *link = PORT_SYNC_LINK_WORKLOOP_STASH;
442 }
443 }
444
445 return ts;
446 }
447
448 /*
449 * Lazily prepare a turnstile so that filt_machport_stash_port()
450 * can be called with the mqueue lock held.
451 *
452 * It will allocate a turnstile in kn_hook if:
453 * - the knote supports sync IPC override,
454 * - we already stashed a port in kn_ext[3],
455 * - the object that will be copied out has a chance to ask to be stashed.
456 *
457 * It is setup so that its inheritor is the workloop turnstile that has been
458 * allocated when this knote was attached.
459 */
460 void
filt_machport_turnstile_prepare_lazily(struct knote * kn,mach_msg_type_name_t msgt_name,ipc_port_t port)461 filt_machport_turnstile_prepare_lazily(
462 struct knote *kn,
463 mach_msg_type_name_t msgt_name,
464 ipc_port_t port)
465 {
466 /* This is called from within filt_machportprocess */
467 assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED));
468
469 if (!filt_machport_kqueue_has_turnstile(kn)) {
470 return;
471 }
472
473 if (kn->kn_ext[3] == 0 || knote_kn_hook_get_raw(kn)) {
474 return;
475 }
476
477 struct turnstile *ts = filt_ipc_kqueue_turnstile(kn);
478 if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && ip_is_special_reply_port(port)) ||
479 (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) {
480 struct turnstile *kn_ts = turnstile_alloc();
481 struct turnstile *ts_store = TURNSTILE_NULL;
482 kn_ts = turnstile_prepare((uintptr_t)kn, &ts_store, kn_ts, TURNSTILE_KNOTE);
483 knote_kn_hook_set_raw(kn, ts_store);
484
485 turnstile_update_inheritor(kn_ts, ts,
486 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
487 turnstile_cleanup();
488 }
489 }
490
491 static void
filt_machport_turnstile_complete_port(struct knote * kn,ipc_port_t port)492 filt_machport_turnstile_complete_port(struct knote *kn, ipc_port_t port)
493 {
494 struct turnstile *ts = TURNSTILE_NULL;
495
496 ip_mq_lock(port);
497 if (ip_is_special_reply_port(port)) {
498 /*
499 * If the reply has been sent to the special reply port already,
500 * then the special reply port may already be reused to do something
501 * entirely different.
502 *
503 * However, the only reason for it to still point to this knote is
504 * that it's still waiting for a reply, so when this is the case,
505 * neuter the linkage.
506 */
507 if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
508 port->ip_sync_inheritor_knote == kn) {
509 ipc_port_adjust_special_reply_port_locked(port, NULL,
510 (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
511 /* port unlocked */
512 } else {
513 ip_mq_unlock(port);
514 }
515 } else {
516 /*
517 * For receive rights, if their IMQ_KNOTE() is still this
518 * knote, then sever the link.
519 */
520 if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
521 port->ip_messages.imq_inheritor_knote == kn) {
522 ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
523 ts = port_send_turnstile(port);
524 }
525 if (ts) {
526 turnstile_reference(ts);
527 turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
528 TURNSTILE_IMMEDIATE_UPDATE);
529 }
530 ip_mq_unlock(port);
531
532 if (ts) {
533 turnstile_update_inheritor_complete(ts,
534 TURNSTILE_INTERLOCK_NOT_HELD);
535 turnstile_deallocate(ts);
536 }
537 }
538
539 ip_release(port);
540 }
541
542 void
filt_wldetach_sync_ipc(struct knote * kn)543 filt_wldetach_sync_ipc(struct knote *kn)
544 {
545 ipc_port_t port = kn->kn_ipc_port;
546 filt_machport_turnstile_complete_port(kn, port);
547 kn->kn_ipc_port = IP_NULL;
548 }
549
550 /*
551 * Other half of filt_machport_turnstile_prepare_lazily()
552 *
553 * This is serialized by the knote state machine.
554 */
555 static void
filt_machport_turnstile_complete(struct knote * kn)556 filt_machport_turnstile_complete(struct knote *kn)
557 {
558 if (kn->kn_ext[3]) {
559 ipc_port_t port = (ipc_port_t)kn->kn_ext[3];
560 filt_machport_turnstile_complete_port(kn, port);
561 kn->kn_ext[3] = 0;
562 }
563
564 struct turnstile *ts = knote_kn_hook_get_raw(kn);
565 if (ts) {
566 turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
567 TURNSTILE_IMMEDIATE_UPDATE);
568 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
569
570 struct turnstile *ts_store = ts;
571 turnstile_complete((uintptr_t)kn, (struct turnstile **)&ts_store, &ts, TURNSTILE_KNOTE);
572 knote_kn_hook_set_raw(kn, ts_store);
573
574 turnstile_cleanup();
575
576 assert(ts);
577 turnstile_deallocate(ts);
578 }
579 }
580
581 static void
filt_machport_link(struct klist * klist,struct knote * kn)582 filt_machport_link(struct klist *klist, struct knote *kn)
583 {
584 struct knote *hd = SLIST_FIRST(klist);
585
586 if (hd && filt_machport_kqueue_has_turnstile(kn)) {
587 SLIST_INSERT_AFTER(hd, kn, kn_selnext);
588 } else {
589 SLIST_INSERT_HEAD(klist, kn, kn_selnext);
590 }
591 }
592
593 static void
filt_machport_unlink(struct klist * klist,struct knote * kn)594 filt_machport_unlink(struct klist *klist, struct knote *kn)
595 {
596 struct knote **knprev;
597
598 KNOTE_DETACH(klist, kn);
599
600 /* make sure the first knote is a knote we can push on */
601 SLIST_FOREACH_PREVPTR(kn, knprev, klist, kn_selnext) {
602 if (filt_machport_kqueue_has_turnstile(kn)) {
603 *knprev = SLIST_NEXT(kn, kn_selnext);
604 SLIST_INSERT_HEAD(klist, kn, kn_selnext);
605 break;
606 }
607 }
608 }
609
610 int
filt_wlattach_sync_ipc(struct knote * kn)611 filt_wlattach_sync_ipc(struct knote *kn)
612 {
613 mach_port_name_t name = (mach_port_name_t)kn->kn_id;
614 ipc_space_t space = current_space();
615 ipc_entry_bits_t bits;
616 ipc_object_t object;
617 ipc_port_t port = IP_NULL;
618 int error = 0;
619
620 if (ipc_right_lookup_read(space, name, &bits, &object) != KERN_SUCCESS) {
621 return ENOENT;
622 }
623 /* object is locked and active */
624
625 if (bits & MACH_PORT_TYPE_RECEIVE) {
626 port = ip_object_to_port(object);
627 if (ip_is_special_reply_port(port) || ip_is_kobject(port)) {
628 error = ENOENT;
629 }
630 } else if (bits & MACH_PORT_TYPE_SEND_ONCE) {
631 port = ip_object_to_port(object);
632 if (!ip_is_special_reply_port(port)) {
633 error = ENOENT;
634 }
635 } else {
636 error = ENOENT;
637 }
638 if (error) {
639 io_unlock(object);
640 return error;
641 }
642
643 if (port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
644 io_unlock(object);
645 /*
646 * We cannot start a sync IPC inheritance chain, only further one
647 * Note: this can also happen if the inheritance chain broke
648 * because the original requestor died.
649 */
650 return ENOENT;
651 }
652
653 if (ip_is_special_reply_port(port)) {
654 ipc_port_adjust_special_reply_port_locked(port, kn,
655 IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE);
656 } else {
657 ipc_port_adjust_port_locked(port, kn, FALSE);
658 }
659
660 /* make sure the port was stashed */
661 assert(kn->kn_ipc_port == port);
662
663 /* port has been unlocked by ipc_port_adjust_* */
664
665 return 0;
666 }
667
668 static int
filt_psetattach(struct knote * kn,ipc_pset_t pset)669 filt_psetattach(struct knote *kn, ipc_pset_t pset)
670 {
671 int result = 0;
672
673 ips_reference(pset);
674 kn->kn_ipc_pset = pset;
675
676 filt_machport_link(&pset->ips_klist, kn);
677 result = filt_pset_filter_result(pset);
678 ips_mq_unlock(pset);
679
680 return result;
681 }
682
683 static int
filt_portattach(struct knote * kn,ipc_port_t port)684 filt_portattach(struct knote *kn, ipc_port_t port)
685 {
686 struct turnstile *send_turnstile = TURNSTILE_NULL;
687 int result = 0;
688
689 if (ip_is_special_reply_port(port)) {
690 /*
691 * Registering for kevents on special reply ports
692 * isn't supported for two reasons:
693 *
694 * 1. it really makes very little sense for a port that
695 * is supposed to be used synchronously
696 *
697 * 2. their ports's ip_klist field will be used to
698 * store the receive turnstile, so we can't possibly
699 * attach them anyway.
700 */
701 ip_mq_unlock(port);
702 knote_set_error(kn, ENOTSUP);
703 return 0;
704 }
705
706 ip_reference(port);
707 kn->kn_ipc_port = port;
708 if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) {
709 /*
710 * We're attaching a port that used to have an IMQ_KNOTE,
711 * clobber this state, we'll fixup its turnstile inheritor below.
712 */
713 ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
714 }
715
716 filt_machport_link(&port->ip_klist, kn);
717 result = filt_port_filter_result(kn, port);
718
719 /*
720 * Update the port's turnstile inheritor
721 *
722 * Unlike filt_machportdetach(), we don't have to care about races for
723 * turnstile_workloop_pusher_info(): filt_machport_link() doesn't affect
724 * already pushing knotes, and if the current one becomes the new
725 * pusher, it'll only be visible when turnstile_workloop_pusher_info()
726 * returns.
727 */
728 send_turnstile = port_send_turnstile(port);
729 if (send_turnstile) {
730 turnstile_reference(send_turnstile);
731 ipc_port_send_update_inheritor(port, send_turnstile,
732 TURNSTILE_IMMEDIATE_UPDATE);
733
734 /*
735 * rdar://problem/48861190
736 *
737 * When a listener connection resumes a peer,
738 * updating the inheritor above has moved the push
739 * from the current thread to the workloop.
740 *
741 * However, we haven't told the workloop yet
742 * that it needs a thread request, and we risk
743 * to be preeempted as soon as we drop the space
744 * lock below.
745 *
746 * To avoid this disable preemption and let kevent
747 * reenable it after it takes the kqlock.
748 */
749 disable_preemption();
750 result |= FILTER_THREADREQ_NODEFEER;
751 }
752
753 ip_mq_unlock(port);
754
755 if (send_turnstile) {
756 turnstile_update_inheritor_complete(send_turnstile,
757 TURNSTILE_INTERLOCK_NOT_HELD);
758 turnstile_deallocate(send_turnstile);
759 }
760
761 return result;
762 }
763
764 static int
filt_machportattach(struct knote * kn,__unused struct kevent_qos_s * kev)765 filt_machportattach(struct knote *kn, __unused struct kevent_qos_s *kev)
766 {
767 mach_port_name_t name = (mach_port_name_t)kn->kn_id;
768 ipc_space_t space = current_space();
769 ipc_entry_bits_t bits;
770 ipc_object_t object;
771 kern_return_t kr;
772
773 kn->kn_flags &= ~EV_EOF;
774 kn->kn_ext[3] = 0;
775
776 if (filt_machport_kqueue_has_turnstile(kn)) {
777 /*
778 * If the filter is likely to support sync IPC override,
779 * and it happens to be attaching to a workloop,
780 * make sure the workloop has an allocated turnstile.
781 */
782 kqueue_alloc_turnstile(knote_get_kq(kn));
783 }
784
785 kr = ipc_right_lookup_read(space, name, &bits, &object);
786
787 if (kr != KERN_SUCCESS) {
788 knote_set_error(kn, ENOENT);
789 return 0;
790 }
791 /* object is locked and active */
792
793 if (bits & MACH_PORT_TYPE_PORT_SET) {
794 kn->kn_filtid = EVFILTID_MACH_PORT_SET;
795 return filt_psetattach(kn, ips_object_to_pset(object));
796 }
797
798 if (bits & MACH_PORT_TYPE_RECEIVE) {
799 kn->kn_filtid = EVFILTID_MACH_PORT;
800 return filt_portattach(kn, ip_object_to_port(object));
801 }
802
803 io_unlock(object);
804 knote_set_error(kn, ENOTSUP);
805 return 0;
806 }
807
808 static void
filt_psetdetach(struct knote * kn)809 filt_psetdetach(struct knote *kn)
810 {
811 ipc_pset_t pset = kn->kn_ipc_pset;
812
813 filt_machport_turnstile_complete(kn);
814
815 ips_mq_lock(pset);
816
817 if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
818 /*
819 * ipc_mqueue_changed() already unhooked this knote from the waitq,
820 */
821 } else {
822 filt_machport_unlink(&pset->ips_klist, kn);
823 }
824
825 kn->kn_ipc_pset = IPS_NULL;
826 ips_mq_unlock(pset);
827 ips_release(pset);
828 }
829
830 static void
filt_portdetach(struct knote * kn)831 filt_portdetach(struct knote *kn)
832 {
833 ipc_port_t port = kn->kn_ipc_port;
834 struct turnstile *send_turnstile = TURNSTILE_NULL;
835
836 filt_machport_turnstile_complete(kn);
837
838 ip_mq_lock(port);
839 if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
840 /*
841 * ipc_mqueue_changed() already unhooked this knote from the waitq,
842 */
843 } else {
844 /*
845 * When the knote being detached is the first one in the list,
846 * then unlinking the knote *and* updating the turnstile inheritor
847 * need to happen atomically with respect to the callers of
848 * turnstile_workloop_pusher_info().
849 *
850 * The caller of turnstile_workloop_pusher_info() will use the kq req
851 * lock (and hence the kqlock), so we just need to hold the kqlock too.
852 */
853 assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY);
854 if (kn == SLIST_FIRST(&port->ip_klist)) {
855 send_turnstile = port_send_turnstile(port);
856 }
857 filt_machport_unlink(&port->ip_klist, kn);
858 struct kqueue *kq = knote_get_kq(kn);
859 kqueue_set_iotier_override(kq, THROTTLE_LEVEL_END);
860 }
861
862 if (send_turnstile) {
863 turnstile_reference(send_turnstile);
864 ipc_port_send_update_inheritor(port, send_turnstile,
865 TURNSTILE_IMMEDIATE_UPDATE);
866 }
867
868 /* Clear the knote pointer once the knote has been removed from turnstile */
869 kn->kn_ipc_port = IP_NULL;
870 ip_mq_unlock(port);
871
872 if (send_turnstile) {
873 turnstile_update_inheritor_complete(send_turnstile,
874 TURNSTILE_INTERLOCK_NOT_HELD);
875 turnstile_deallocate(send_turnstile);
876 }
877
878 ip_release(port);
879 }
880
/*
 * filt_{pset,port}event - deliver events into the mach port filter
 *
 * Mach port message arrival events are currently only posted via the
 * kqueue filter routine for ports.
 *
 * If there is a message at the head of the queue,
 * we indicate that the knote should go active. If
 * the message is to be direct-received, we adjust the
 * QoS of the knote according to the requested and override
 * QoS of that first message.
 *
 * When the knote is for a port-set, the hint is non-zero
 * and is the waitq which is posting.
 */
896 static int
filt_psetevent(struct knote * kn __unused,long hint __assert_only)897 filt_psetevent(struct knote *kn __unused, long hint __assert_only)
898 {
899 /*
900 * When called for a port-set,
901 * the posting port waitq is locked.
902 *
903 * waitq_set_first_prepost()
904 * in filt_machport_filter_result()
905 * would try to lock it and be very sad.
906 *
907 * Just trust what we know to be true.
908 */
909 assert(hint != 0);
910 return FILTER_ACTIVE;
911 }
912
913 static int
filt_portevent(struct knote * kn,long hint __assert_only)914 filt_portevent(struct knote *kn, long hint __assert_only)
915 {
916 assert(hint == 0);
917 return filt_port_filter_result(kn, kn->kn_ipc_port);
918 }
919
920 void
ipc_pset_prepost(struct waitq_set * wqs,struct waitq * waitq)921 ipc_pset_prepost(struct waitq_set *wqs, struct waitq *waitq)
922 {
923 KNOTE(&ips_from_waitq(wqs)->ips_klist, (long)waitq);
924 }
925
926 static void
filt_machporttouch(struct knote * kn,struct kevent_qos_s * kev)927 filt_machporttouch(struct knote *kn, struct kevent_qos_s *kev)
928 {
929 /*
930 * Specificying MACH_RCV_MSG or MACH_RCV_SYNC_PEEK during attach results in
931 * allocation of a turnstile. Modifying the filter flags to include these
932 * flags later, without a turnstile being allocated, leads to
933 * inconsistencies.
934 */
935 if ((kn->kn_sfflags ^ kev->fflags) & (MACH_RCV_MSG | MACH_RCV_SYNC_PEEK)) {
936 kev->flags |= EV_ERROR;
937 kev->data = EINVAL;
938 return;
939 }
940
941 /* copy in new settings and save off new input fflags */
942 kn->kn_sfflags = kev->fflags;
943 kn->kn_ext[0] = kev->ext[0];
944 kn->kn_ext[1] = kev->ext[1];
945
946 if (kev->flags & EV_ENABLE) {
947 /*
948 * If the knote is being enabled, make sure there's no lingering
949 * IPC overrides from the previous message delivery.
950 */
951 filt_machport_turnstile_complete(kn);
952 }
953 }
954
955 static int
filt_psettouch(struct knote * kn,struct kevent_qos_s * kev)956 filt_psettouch(struct knote *kn, struct kevent_qos_s *kev)
957 {
958 ipc_pset_t pset = kn->kn_ipc_pset;
959 int result = 0;
960
961 filt_machporttouch(kn, kev);
962 if (kev->flags & EV_ERROR) {
963 return 0;
964 }
965
966 ips_mq_lock(pset);
967 result = filt_pset_filter_result(pset);
968 ips_mq_unlock(pset);
969
970 return result;
971 }
972
973 static int
filt_porttouch(struct knote * kn,struct kevent_qos_s * kev)974 filt_porttouch(struct knote *kn, struct kevent_qos_s *kev)
975 {
976 ipc_port_t port = kn->kn_ipc_port;
977 int result = 0;
978
979 filt_machporttouch(kn, kev);
980 if (kev->flags & EV_ERROR) {
981 return 0;
982 }
983
984 ip_mq_lock(port);
985 result = filt_port_filter_result(kn, port);
986 ip_mq_unlock(port);
987
988 return result;
989 }
990
991 static int
filt_machportprocess(struct knote * kn,struct kevent_qos_s * kev,ipc_object_t object,ipc_object_type_t otype)992 filt_machportprocess(
993 struct knote *kn,
994 struct kevent_qos_s *kev,
995 ipc_object_t object,
996 ipc_object_type_t otype)
997 {
998 thread_t self = current_thread();
999 kevent_ctx_t kectx = NULL;
1000
1001 wait_result_t wresult;
1002 mach_msg_option64_t option64;
1003 mach_vm_address_t msg_addr;
1004 mach_msg_size_t max_msg_size;
1005 mach_msg_recv_result_t msgr;
1006
1007 int result = FILTER_ACTIVE;
1008
1009 /* Capture current state */
1010 knote_fill_kevent(kn, kev, MACH_PORT_NULL);
1011
1012 /* Clear port reference, use ext3 as size of msg aux data */
1013 kev->ext[3] = 0;
1014
1015 /* If already deallocated/moved return one last EOF event */
1016 if (kev->flags & EV_EOF) {
1017 return FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
1018 }
1019
1020 /*
1021 * Only honor supported receive options. If no options are
1022 * provided, just force a MACH_RCV_LARGE to detect the
1023 * name of the port and sizeof the waiting message.
1024 *
1025 * Extend kn_sfflags to 64 bits.
1026 *
1027 * Add MACH_RCV_TIMEOUT to never wait (in case someone concurrently
1028 * dequeued the message that made this knote active already).
1029 */
1030 option64 = kn->kn_sfflags & (MACH_RCV_MSG | MACH_RCV_LARGE |
1031 MACH_RCV_LARGE_IDENTITY | MACH_RCV_TRAILER_MASK |
1032 MACH_RCV_VOUCHER | MACH_MSG_STRICT_REPLY);
1033 option64 = ipc_current_msg_options(current_task(), option64);
1034
1035 if (option64 & MACH_RCV_MSG) {
1036 msg_addr = (mach_vm_address_t) kn->kn_ext[0];
1037 max_msg_size = (mach_msg_size_t) kn->kn_ext[1];
1038
1039 /*
1040 * Copy out the incoming message as vector, and append aux data
1041 * immediately after the message proper (if any) and report its
1042 * size on ext3.
1043 *
1044 * Note: MACH64_RCV_LINEAR_VECTOR is how the receive machinery
1045 * knows this comes from kevent (see comment in
1046 * mach_msg_receive_too_large()).
1047 */
1048 option64 |= (MACH64_MSG_VECTOR | MACH64_RCV_LINEAR_VECTOR);
1049
1050 /*
1051 * If the kevent didn't specify a buffer and length, carve a buffer
1052 * from the filter processing data according to the flags.
1053 */
1054 if (max_msg_size == 0) {
1055 kectx = kevent_get_context(self);
1056 msg_addr = (mach_vm_address_t)kectx->kec_data_out;
1057 max_msg_size = (mach_msg_size_t)kectx->kec_data_resid;
1058 option64 |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY);
1059 /* Receive vector linearly onto stack */
1060 if (kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) {
1061 option64 |= MACH64_RCV_STACK;
1062 }
1063 }
1064 } else {
1065 /* just detect the port name (if a set) and size of the first message */
1066 option64 = MACH_RCV_LARGE;
1067 msg_addr = 0;
1068 max_msg_size = 0;
1069 }
1070 option64 |= MACH_RCV_TIMEOUT; /* never wait */
1071
1072 /*
1073 * Set up to receive a message or the notification of a
1074 * too large message. But never allow this call to wait.
1075 * If the user provided aditional options, like trailer
1076 * options, pass those through here. But we don't support
1077 * scatter lists through this interface.
1078 *
1079 * Note: while in filt_machportprocess(),
1080 * the knote has a reference on `object` that we can borrow.
1081 */
1082
1083 /* Set up message proper receive params on thread */
1084 bzero(&self->ith_receive, sizeof(self->ith_receive));
1085 self->ith_recv_bufs = (mach_msg_recv_bufs_t){
1086 .recv_msg_addr = msg_addr,
1087 .recv_msg_size = max_msg_size,
1088 };
1089 self->ith_object = object;
1090 self->ith_option = option64;
1091 self->ith_knote = kn;
1092
1093 ipc_object_validate(object, otype);
1094
1095 waitq_lock(io_waitq(object));
1096 wresult = ipc_mqueue_receive_on_thread_and_unlock(io_waitq(object),
1097 MACH_MSG_TIMEOUT_NONE, THREAD_INTERRUPTIBLE, self);
1098 /* port unlocked */
1099
1100 /* If we timed out, or the process is exiting, just zero. */
1101 if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) {
1102 assert(self->turnstile != TURNSTILE_NULL);
1103 self->ith_knote = ITH_KNOTE_NULL;
1104 return 0;
1105 }
1106
1107 assert(wresult == THREAD_NOT_WAITING);
1108 assert(self->ith_state != MACH_RCV_IN_PROGRESS);
1109
1110 /*
1111 * If we weren't attempting to receive a message
1112 * directly, we need to return the port name in
1113 * the kevent structure.
1114 */
1115 if ((option64 & MACH_RCV_MSG) != MACH_RCV_MSG) {
1116 assert(self->ith_state == MACH_RCV_TOO_LARGE);
1117 assert(self->ith_kmsg == IKM_NULL);
1118 kev->data = self->ith_receiver_name;
1119 self->ith_knote = ITH_KNOTE_NULL;
1120 return result;
1121 }
1122
1123 #if CONFIG_PREADOPT_TG
1124 /* If we're the first EVFILT_MACHPORT knote that is being processed for this
1125 * kqwl, then make sure to preadopt the thread group from the kmsg we're
1126 * about to receive. This is to make sure that we fix up the preadoption
1127 * thread group correctly on the receive side for the first message.
1128 */
1129 struct kqueue *kq = knote_get_kq(kn);
1130
1131 if (self->ith_kmsg) {
1132 struct thread_group *tg = ipc_kmsg_get_thread_group(self->ith_kmsg);
1133
1134 kqueue_process_preadopt_thread_group(self, kq, tg);
1135 }
1136 #endif
1137 if (io_is_any_port_type(otype)) {
1138 ipc_port_t port = ip_object_to_port(object);
1139 struct kqueue *kqwl = knote_get_kq(kn);
1140 if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
1141 /*
1142 * Lock the port to make sure port->ip_kernel_iotier_override does
1143 * not change while updating the kqueue override, else kqueue could
1144 * have old iotier value.
1145 */
1146 ip_mq_lock(port);
1147 kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
1148 ip_mq_unlock(port);
1149 result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
1150 }
1151 }
1152
1153 /*
1154 * Attempt to receive the message directly, returning
1155 * the results in the fflags field.
1156 */
1157 io_reference(object);
1158 kev->fflags = mach_msg_receive_results(&msgr);
1159
1160 /* kmsg and object reference consumed */
1161
1162 /*
1163 * if the user asked for the identity of ports containing a
1164 * a too-large message, return it in the data field (as we
1165 * do for messages we didn't try to receive).
1166 */
1167 kev->ext[1] = msgr.msgr_msg_size + msgr.msgr_trailer_size;
1168 kev->ext[3] = msgr.msgr_aux_size; /* Only lower 32 bits of ext3 are used */
1169 if (kev->fflags == MACH_RCV_TOO_LARGE &&
1170 (option64 & MACH_RCV_LARGE_IDENTITY)) {
1171 kev->data = msgr.msgr_recv_name;
1172 } else {
1173 kev->data = MACH_PORT_NULL;
1174 }
1175
1176 /*
1177 * If we used a data buffer carved out from the filt_process data,
1178 * store the address used in the knote and adjust the residual and
1179 * other parameters for future use.
1180 */
1181 if (kectx && kev->fflags != MACH_RCV_TOO_LARGE) {
1182 mach_vm_size_t size = msgr.msgr_msg_size +
1183 msgr.msgr_trailer_size + msgr.msgr_aux_size;
1184
1185 assert(kectx->kec_data_resid >= size);
1186 kectx->kec_data_resid -= size;
1187 if ((kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) == 0) {
1188 kev->ext[0] = kectx->kec_data_out;
1189 kectx->kec_data_out += size;
1190 } else {
1191 assert(option64 & MACH64_RCV_STACK);
1192 kev->ext[0] = kectx->kec_data_out + kectx->kec_data_resid;
1193 }
1194 }
1195
1196 /*
1197 * Apply message-based QoS values to output kevent as prescribed.
1198 * The kev->ext[2] field gets (msg-qos << 32) | (override-qos).
1199 */
1200 if (kev->fflags == MACH_MSG_SUCCESS) {
1201 kev->ext[2] = ((uint64_t)msgr.msgr_priority << 32) |
1202 _pthread_priority_make_from_thread_qos(msgr.msgr_qos_ovrd, 0, 0);
1203 }
1204
1205 self->ith_knote = ITH_KNOTE_NULL;
1206 return result;
1207 }
1208
1209 static int
filt_psetprocess(struct knote * kn,struct kevent_qos_s * kev)1210 filt_psetprocess(struct knote *kn, struct kevent_qos_s *kev)
1211 {
1212 ipc_object_t io = ips_to_object(kn->kn_ipc_pset);
1213
1214 return filt_machportprocess(kn, kev, io, IOT_PORT_SET);
1215 }
1216
1217 static int
filt_portprocess(struct knote * kn,struct kevent_qos_s * kev)1218 filt_portprocess(struct knote *kn, struct kevent_qos_s *kev)
1219 {
1220 ipc_object_t io = ip_to_object(kn->kn_ipc_port);
1221
1222 return filt_machportprocess(kn, kev, io, IOT_PORT);
1223 }
1224
1225 static void
filt_machportsanitizedcopyout(struct knote * kn,struct kevent_qos_s * kev)1226 filt_machportsanitizedcopyout(struct knote *kn, struct kevent_qos_s *kev)
1227 {
1228 *kev = *(struct kevent_qos_s *)&kn->kn_kevent;
1229
1230 // We may have stashed the address to the port that is pushing on the sync
1231 // IPC so clear it out.
1232 kev->ext[3] = 0;
1233 }
1234
1235 const struct filterops machport_attach_filtops = {
1236 .f_adjusts_qos = true,
1237 .f_extended_codes = true,
1238 .f_attach = filt_machportattach,
1239 .f_sanitized_copyout = filt_machportsanitizedcopyout,
1240 };
1241
1242 const struct filterops mach_port_filtops = {
1243 .f_adjusts_qos = true,
1244 .f_extended_codes = true,
1245 .f_detach = filt_portdetach,
1246 .f_event = filt_portevent,
1247 .f_touch = filt_porttouch,
1248 .f_process = filt_portprocess,
1249 .f_sanitized_copyout = filt_machportsanitizedcopyout,
1250 };
1251
1252 const struct filterops mach_port_set_filtops = {
1253 .f_adjusts_qos = true,
1254 .f_extended_codes = true,
1255 .f_detach = filt_psetdetach,
1256 .f_event = filt_psetevent,
1257 .f_touch = filt_psettouch,
1258 .f_process = filt_psetprocess,
1259 .f_sanitized_copyout = filt_machportsanitizedcopyout,
1260 };
1261