1 /*
2 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: ipc/ipc_pset.c
60 * Author: Rich Draves
61 * Date: 1989
62 *
63 * Functions to manipulate IPC port sets.
64 */
65
66 #include <mach/port.h>
67 #include <mach/kern_return.h>
68 #include <mach/message.h>
69 #include <ipc/ipc_mqueue.h>
70 #include <ipc/ipc_object.h>
71 #include <ipc/ipc_pset.h>
72 #include <ipc/ipc_right.h>
73 #include <ipc/ipc_space.h>
74 #include <ipc/ipc_port.h>
75 #include <ipc/ipc_kmsg.h>
76 #include <kern/policy_internal.h>
77
78 #include <kern/kern_types.h>
79
80 #include <vm/vm_map.h>
81 #include <libkern/section_keywords.h>
82 #include <pthread/priority_private.h>
83
84 /* processor_set stole ipc_pset_init */
85 static void
ipc_port_set_init(ipc_pset_t pset,mach_port_name_t name,int policy)86 ipc_port_set_init(ipc_pset_t pset, mach_port_name_t name, int policy)
87 {
88 waitq_init(&pset->ips_wqset, WQT_PORT_SET, policy | SYNC_POLICY_FIFO);
89 klist_init(&pset->ips_klist);
90 pset->ips_wqset.wqset_index = MACH_PORT_INDEX(name);
91 }
92
93 /*
94 * Routine: ipc_pset_alloc
95 * Purpose:
96 * Allocate a port set.
97 * Conditions:
98 * Nothing locked. If successful, the port set is returned
99 * locked. (The caller doesn't have a reference.)
100 * Returns:
101 * KERN_SUCCESS The port set is allocated.
102 * KERN_INVALID_TASK The space is dead.
103 * KERN_NO_SPACE No room for an entry in the space.
104 */
105
106 kern_return_t
ipc_pset_alloc(ipc_space_t space,mach_port_name_t * namep,ipc_pset_t * psetp)107 ipc_pset_alloc(
108 ipc_space_t space,
109 mach_port_name_t *namep,
110 ipc_pset_t *psetp)
111 {
112 ipc_pset_t pset;
113 mach_port_name_t name;
114 kern_return_t kr;
115
116 kr = ipc_object_alloc(space, IOT_PORT_SET,
117 MACH_PORT_TYPE_PORT_SET, 0,
118 &name, (ipc_object_t *) &pset);
119 if (kr != KERN_SUCCESS) {
120 return kr;
121 }
122 /* space is locked */
123
124 ipc_port_set_init(pset, name, SYNC_POLICY_INIT_LOCKED);
125 /* port set is locked */
126
127 is_write_unlock(space);
128
129 *namep = name;
130 *psetp = pset;
131 return KERN_SUCCESS;
132 }
133
134 /*
135 * Routine: ipc_pset_alloc_name
136 * Purpose:
137 * Allocate a port set, with a specific name.
138 * Conditions:
139 * Nothing locked. If successful, the port set is returned
140 * locked. (The caller doesn't have a reference.)
141 * Returns:
142 * KERN_SUCCESS The port set is allocated.
143 * KERN_INVALID_TASK The space is dead.
144 * KERN_NAME_EXISTS The name already denotes a right.
145 */
146
147 kern_return_t
ipc_pset_alloc_name(ipc_space_t space,mach_port_name_t name,ipc_pset_t * psetp)148 ipc_pset_alloc_name(
149 ipc_space_t space,
150 mach_port_name_t name,
151 ipc_pset_t *psetp)
152 {
153 return ipc_object_alloc_name(space, IOT_PORT_SET,
154 MACH_PORT_TYPE_PORT_SET, 0,
155 name, (ipc_object_t *)psetp, ^(ipc_object_t object){
156 ipc_port_set_init(ips_object_to_pset(object), name,
157 SYNC_POLICY_INIT_LOCKED);
158 });
159 }
160
161
162 /*
163 * Routine: ipc_pset_alloc_special
164 * Purpose:
165 * Allocate a port set in a special space.
166 * The new port set is returned with one ref.
167 * If unsuccessful, IPS_NULL is returned.
168 * Conditions:
169 * Nothing locked.
170 */
171 ipc_pset_t
ipc_pset_alloc_special(__assert_only ipc_space_t space)172 ipc_pset_alloc_special(
173 __assert_only ipc_space_t space)
174 {
175 ipc_pset_t pset;
176
177 assert(space != IS_NULL);
178 assert(!is_active(space));
179
180 pset = ips_object_to_pset(io_alloc(IOT_PORT_SET, Z_WAITOK | Z_ZERO));
181 if (pset == IPS_NULL) {
182 return IPS_NULL;
183 }
184
185 os_atomic_init(&pset->ips_object.io_bits, io_makebits(IOT_PORT_SET));
186 os_atomic_init(&pset->ips_object.io_references, 1);
187
188 ipc_port_set_init(pset, MACH_PORT_SPECIAL_DEFAULT, 0);
189
190 return pset;
191 }
192
193
194 /*
195 * Routine: ipc_pset_destroy
196 * Purpose:
197 * Destroys a port_set.
198 * Conditions:
199 * The port_set is locked and alive.
200 * The caller has a reference, which is consumed.
201 * Afterwards, the port_set is unlocked and dead.
202 */
203
204 void
ipc_pset_destroy(ipc_space_t space,ipc_pset_t pset)205 ipc_pset_destroy(
206 ipc_space_t space,
207 ipc_pset_t pset)
208 {
209 waitq_link_list_t free_l = { };
210
211 assert(ips_active(pset));
212
213 io_bits_andnot(ips_to_object(pset), IO_BITS_ACTIVE);
214
215 /*
216 * Set all waiters on the portset running to
217 * discover the change.
218 *
219 * Then under the same lock hold, deinit the waitq-set,
220 * which will remove all the member message queues,
221 * linkages and clean up preposts.
222 */
223 ipc_mqueue_changed(space, &pset->ips_wqset);
224 waitq_invalidate(&pset->ips_wqset);
225 waitq_set_unlink_all_locked(&pset->ips_wqset, &free_l);
226
227 ips_mq_unlock(pset);
228
229 ips_release(pset); /* consume the ref our caller gave us */
230
231 waitq_link_free_list(WQT_PORT_SET, &free_l);
232 }
233
234 /*
235 * Routine: ipc_pset_finalize
236 * Purpose:
237 * Called on last reference deallocate to
238 * free any remaining data associated with the pset.
239 * Conditions:
240 * Nothing locked.
241 */
242 void
ipc_pset_finalize(ipc_pset_t pset)243 ipc_pset_finalize(
244 ipc_pset_t pset)
245 {
246 waitq_deinit(&pset->ips_wqset);
247 }
248
249
250 /*
251 * Kqueue EVFILT_MACHPORT support
252 *
253 * - kn_ipc_obj points to the monitored ipc port or pset
254 *
255 * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer
256 * that can be used to direct-deliver messages when
257 * MACH_RCV_MSG is set in kn_sfflags
258 *
259 * - (in/out) ext[1] holds a mach_msg_size_t representing the size
260 * of the userspace buffer held in ext[0].
261 *
262 * - (out) ext[2] is used to deliver qos information
263 * about the send queue to userspace.
264 *
 * - (abused) ext[3] is used in kernel to hold a reference to the first port
 *   with a turnstile that participates in sync IPC override.
267 *
 * - kn_hook is optionally a "knote" turnstile. It is used as the inheritor
 *   of turnstiles for rights copied out as part of direct message delivery
 *   when they can participate in sync IPC override.
271 *
272 * It is used to atomically neuter the sync IPC override when the knote is
273 * re-enabled.
274 *
275 */
276
277 #include <sys/event.h>
278 #include <sys/errno.h>
279
280 static int
filt_machport_filter_result(struct knote * kn,ipc_object_t object)281 filt_machport_filter_result(struct knote *kn, ipc_object_t object)
282 {
283 struct waitq *wq = io_waitq(object);
284 ipc_kmsg_t first;
285 int result = 0;
286
287 io_lock_held(object);
288
289 if (kn->kn_sfflags & MACH_RCV_MSG) {
290 result = FILTER_RESET_EVENT_QOS;
291 }
292
293 if (!waitq_is_valid(wq)) {
294 return result;
295 }
296
297 if (waitq_type(wq) == WQT_PORT_SET) {
298 ipc_pset_t pset = ips_object_to_pset(object);
299 return waitq_set_first_prepost(&pset->ips_wqset, WQS_PREPOST_PEEK) ?
300 FILTER_ACTIVE : 0;
301 }
302
303 ipc_port_t port = ip_object_to_port(object);
304 struct kqueue *kqwl = knote_get_kq(kn);
305
306 if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
307 kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
308 result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
309 }
310
311 first = ipc_kmsg_queue_first(&port->ip_messages.imq_messages);
312 if (!first) {
313 return result;
314 }
315
316 result = FILTER_ACTIVE;
317 if (kn->kn_sfflags & MACH_RCV_MSG) {
318 result |= FILTER_ADJUST_EVENT_QOS(first->ikm_qos_override);
319 }
320
321 #if CONFIG_PREADOPT_TG
322 struct thread_group *tg = ipc_kmsg_get_thread_group(first);
323 if (tg) {
324 struct kqueue *kq = knote_get_kq(kn);
325 kqueue_set_preadopted_thread_group(kq, tg,
326 first->ikm_qos_override);
327 }
328 #endif
329
330 return result;
331 }
332
333 struct turnstile *
filt_ipc_kqueue_turnstile(struct knote * kn)334 filt_ipc_kqueue_turnstile(struct knote *kn)
335 {
336 assert(kn->kn_filter == EVFILT_MACHPORT || kn->kn_filter == EVFILT_WORKLOOP);
337 return kqueue_turnstile(knote_get_kq(kn));
338 }
339
340 bool
filt_machport_kqueue_has_turnstile(struct knote * kn)341 filt_machport_kqueue_has_turnstile(struct knote *kn)
342 {
343 assert(kn->kn_filter == EVFILT_MACHPORT);
344 return ((kn->kn_sfflags & MACH_RCV_MSG) || (kn->kn_sfflags & MACH_RCV_SYNC_PEEK))
345 && (kn->kn_flags & EV_DISPATCH);
346 }
347
348 /*
 * Stashes a port that participates in sync IPC override until the knote
 * is re-enabled.
351 *
352 * It returns:
353 * - the turnstile to use as an inheritor for the stashed port
354 * - the kind of stash that happened as PORT_SYNC_* value among:
355 * o not stashed (no sync IPC support)
356 * o stashed in the knote (in kn_ext[3])
357 * o to be hooked to the kn_hook knote
358 */
359 struct turnstile *
filt_machport_stash_port(struct knote * kn,ipc_port_t port,int * link)360 filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
361 {
362 struct turnstile *ts = TURNSTILE_NULL;
363
364 if (kn->kn_filter == EVFILT_WORKLOOP) {
365 assert(kn->kn_ipc_obj == NULL);
366 kn->kn_ipc_obj = ip_to_object(port);
367 ip_reference(port);
368 if (link) {
369 *link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
370 }
371 ts = filt_ipc_kqueue_turnstile(kn);
372 } else if (!filt_machport_kqueue_has_turnstile(kn)) {
373 if (link) {
374 *link = PORT_SYNC_LINK_NO_LINKAGE;
375 }
376 } else if (kn->kn_ext[3] == 0) {
377 ip_reference(port);
378 kn->kn_ext[3] = (uintptr_t)port;
379 ts = filt_ipc_kqueue_turnstile(kn);
380 if (link) {
381 *link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
382 }
383 } else {
384 ts = (struct turnstile *)kn->kn_hook;
385 if (link) {
386 *link = PORT_SYNC_LINK_WORKLOOP_STASH;
387 }
388 }
389
390 return ts;
391 }
392
393 /*
394 * Lazily prepare a turnstile so that filt_machport_stash_port()
395 * can be called with the mqueue lock held.
396 *
397 * It will allocate a turnstile in kn_hook if:
398 * - the knote supports sync IPC override,
399 * - we already stashed a port in kn_ext[3],
400 * - the object that will be copied out has a chance to ask to be stashed.
401 *
402 * It is setup so that its inheritor is the workloop turnstile that has been
403 * allocated when this knote was attached.
404 */
405 void
filt_machport_turnstile_prepare_lazily(struct knote * kn,mach_msg_type_name_t msgt_name,ipc_port_t port)406 filt_machport_turnstile_prepare_lazily(
407 struct knote *kn,
408 mach_msg_type_name_t msgt_name,
409 ipc_port_t port)
410 {
411 /* This is called from within filt_machportprocess */
412 assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED));
413
414 if (!filt_machport_kqueue_has_turnstile(kn)) {
415 return;
416 }
417
418 if (kn->kn_ext[3] == 0 || kn->kn_hook) {
419 return;
420 }
421
422 struct turnstile *ts = filt_ipc_kqueue_turnstile(kn);
423 if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) ||
424 (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) {
425 struct turnstile *kn_ts = turnstile_alloc();
426 kn_ts = turnstile_prepare((uintptr_t)kn,
427 (struct turnstile **)&kn->kn_hook, kn_ts, TURNSTILE_KNOTE);
428 turnstile_update_inheritor(kn_ts, ts,
429 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
430 turnstile_cleanup();
431 }
432 }
433
434 static void
filt_machport_turnstile_complete_port(struct knote * kn,ipc_port_t port)435 filt_machport_turnstile_complete_port(struct knote *kn, ipc_port_t port)
436 {
437 struct turnstile *ts = TURNSTILE_NULL;
438
439 ip_mq_lock(port);
440 if (port->ip_specialreply) {
441 /*
442 * If the reply has been sent to the special reply port already,
443 * then the special reply port may already be reused to do something
444 * entirely different.
445 *
446 * However, the only reason for it to still point to this knote is
447 * that it's still waiting for a reply, so when this is the case,
448 * neuter the linkage.
449 */
450 if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
451 port->ip_sync_inheritor_knote == kn) {
452 ipc_port_adjust_special_reply_port_locked(port, NULL,
453 (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
454 /* port unlocked */
455 } else {
456 ip_mq_unlock(port);
457 }
458 } else {
459 /*
460 * For receive rights, if their IMQ_KNOTE() is still this
461 * knote, then sever the link.
462 */
463 if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
464 port->ip_messages.imq_inheritor_knote == kn) {
465 ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
466 ts = port_send_turnstile(port);
467 }
468 if (ts) {
469 turnstile_reference(ts);
470 turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
471 TURNSTILE_IMMEDIATE_UPDATE);
472 }
473 ip_mq_unlock(port);
474
475 if (ts) {
476 turnstile_update_inheritor_complete(ts,
477 TURNSTILE_INTERLOCK_NOT_HELD);
478 turnstile_deallocate(ts);
479 }
480 }
481
482 ip_release(port);
483 }
484
485 void
filt_wldetach_sync_ipc(struct knote * kn)486 filt_wldetach_sync_ipc(struct knote *kn)
487 {
488 ipc_object_t io = kn->kn_ipc_obj;
489 filt_machport_turnstile_complete_port(kn, ip_object_to_port(io));
490 kn->kn_ipc_obj = IO_NULL;
491 }
492
493 /*
494 * Other half of filt_machport_turnstile_prepare_lazily()
495 *
496 * This is serialized by the knote state machine.
497 */
498 static void
filt_machport_turnstile_complete(struct knote * kn)499 filt_machport_turnstile_complete(struct knote *kn)
500 {
501 if (kn->kn_ext[3]) {
502 ipc_port_t port = (ipc_port_t)kn->kn_ext[3];
503 filt_machport_turnstile_complete_port(kn, port);
504 kn->kn_ext[3] = 0;
505 }
506
507 if (kn->kn_hook) {
508 struct turnstile *ts = kn->kn_hook;
509
510 turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
511 TURNSTILE_IMMEDIATE_UPDATE);
512 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
513
514 turnstile_complete((uintptr_t)kn, (struct turnstile **)&kn->kn_hook, &ts, TURNSTILE_KNOTE);
515 turnstile_cleanup();
516
517 assert(ts);
518 turnstile_deallocate(ts);
519 }
520 }
521
522 static void
filt_machport_link(struct klist * klist,struct knote * kn)523 filt_machport_link(struct klist *klist, struct knote *kn)
524 {
525 struct knote *hd = SLIST_FIRST(klist);
526
527 if (hd && filt_machport_kqueue_has_turnstile(kn)) {
528 SLIST_INSERT_AFTER(hd, kn, kn_selnext);
529 } else {
530 SLIST_INSERT_HEAD(klist, kn, kn_selnext);
531 }
532 }
533
534 static void
filt_machport_unlink(struct klist * klist,struct knote * kn)535 filt_machport_unlink(struct klist *klist, struct knote *kn)
536 {
537 struct knote **knprev;
538
539 KNOTE_DETACH(klist, kn);
540
541 /* make sure the first knote is a knote we can push on */
542 SLIST_FOREACH_PREVPTR(kn, knprev, klist, kn_selnext) {
543 if (filt_machport_kqueue_has_turnstile(kn)) {
544 *knprev = SLIST_NEXT(kn, kn_selnext);
545 SLIST_INSERT_HEAD(klist, kn, kn_selnext);
546 break;
547 }
548 }
549 }
550
551 int
filt_wlattach_sync_ipc(struct knote * kn)552 filt_wlattach_sync_ipc(struct knote *kn)
553 {
554 mach_port_name_t name = (mach_port_name_t)kn->kn_id;
555 ipc_space_t space = current_space();
556 ipc_entry_bits_t bits;
557 ipc_object_t object;
558 ipc_port_t port = IP_NULL;
559 int error = 0;
560
561 if (ipc_right_lookup_read(space, name, &bits, &object) != KERN_SUCCESS) {
562 return ENOENT;
563 }
564 /* object is locked and active */
565
566 if (bits & MACH_PORT_TYPE_RECEIVE) {
567 port = ip_object_to_port(object);
568 if (port->ip_specialreply) {
569 error = ENOENT;
570 }
571 } else if (bits & MACH_PORT_TYPE_SEND_ONCE) {
572 port = ip_object_to_port(object);
573 if (!port->ip_specialreply) {
574 error = ENOENT;
575 }
576 } else {
577 error = ENOENT;
578 }
579 if (error) {
580 io_unlock(object);
581 return error;
582 }
583
584 if (port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
585 io_unlock(object);
586 /*
587 * We cannot start a sync IPC inheritance chain, only further one
588 * Note: this can also happen if the inheritance chain broke
589 * because the original requestor died.
590 */
591 return ENOENT;
592 }
593
594 if (port->ip_specialreply) {
595 ipc_port_adjust_special_reply_port_locked(port, kn,
596 IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE);
597 } else {
598 ipc_port_adjust_port_locked(port, kn, FALSE);
599 }
600
601 /* make sure the port was stashed */
602 assert(kn->kn_ipc_obj == ip_to_object(port));
603
604 /* port has been unlocked by ipc_port_adjust_* */
605
606 return 0;
607 }
608
609 static int
filt_machportattach(struct knote * kn,__unused struct kevent_qos_s * kev)610 filt_machportattach(
611 struct knote *kn,
612 __unused struct kevent_qos_s *kev)
613 {
614 mach_port_name_t name = (mach_port_name_t)kn->kn_id;
615 ipc_space_t space = current_space();
616 ipc_entry_bits_t bits;
617 ipc_object_t object;
618 struct turnstile *send_turnstile = TURNSTILE_NULL;
619
620 int error = 0;
621 int result = 0;
622 kern_return_t kr;
623
624 kn->kn_flags &= ~EV_EOF;
625 kn->kn_ext[3] = 0;
626
627 if (filt_machport_kqueue_has_turnstile(kn)) {
628 /*
629 * If the filter is likely to support sync IPC override,
630 * and it happens to be attaching to a workloop,
631 * make sure the workloop has an allocated turnstile.
632 */
633 kqueue_alloc_turnstile(knote_get_kq(kn));
634 }
635
636 kr = ipc_right_lookup_read(space, name, &bits, &object);
637
638 if (kr != KERN_SUCCESS) {
639 error = ENOENT;
640 goto out;
641 }
642 /* object is locked and active */
643
644 if (bits & MACH_PORT_TYPE_PORT_SET) {
645 ipc_pset_t pset = ips_object_to_pset(object);
646
647 io_reference(object);
648 kn->kn_ipc_obj = object;
649 filt_machport_link(&pset->ips_klist, kn);
650 result = filt_machport_filter_result(kn, object);
651 io_unlock(object);
652 } else if (bits & MACH_PORT_TYPE_RECEIVE) {
653 ipc_port_t port = ip_object_to_port(object);
654
655 if (port->ip_specialreply) {
656 /*
657 * Registering for kevents on special reply ports
658 * isn't supported for two reasons:
659 *
660 * 1. it really makes very little sense for a port that
661 * is supposed to be used synchronously
662 *
663 * 2. their ports's ip_klist field will be used to
664 * store the receive turnstile, so we can't possibly
665 * attach them anyway.
666 */
667 io_unlock(object);
668 error = ENOTSUP;
669 goto out;
670 }
671
672 io_reference(object);
673 kn->kn_ipc_obj = object;
674 if (port->ip_sync_link_state != PORT_SYNC_LINK_ANY) {
675 /*
676 * We're attaching a port that used to have an IMQ_KNOTE,
677 * clobber this state, we'll fixup its turnstile inheritor below.
678 */
679 ipc_port_adjust_sync_link_state_locked(port, PORT_SYNC_LINK_ANY, NULL);
680 }
681
682 filt_machport_link(&port->ip_klist, kn);
683 result = filt_machport_filter_result(kn, object);
684
685 /*
686 * Update the port's turnstile inheritor
687 *
688 * Unlike filt_machportdetach(), we don't have to care about races for
689 * turnstile_workloop_pusher_info(): filt_machport_link() doesn't affect
690 * already pushing knotes, and if the current one becomes the new
691 * pusher, it'll only be visible when turnstile_workloop_pusher_info()
692 * returns.
693 */
694 send_turnstile = port_send_turnstile(port);
695 if (send_turnstile) {
696 turnstile_reference(send_turnstile);
697 ipc_port_send_update_inheritor(port, send_turnstile,
698 TURNSTILE_IMMEDIATE_UPDATE);
699
700 /*
701 * rdar://problem/48861190
702 *
703 * When a listener connection resumes a peer,
704 * updating the inheritor above has moved the push
705 * from the current thread to the workloop.
706 *
707 * However, we haven't told the workloop yet
708 * that it needs a thread request, and we risk
709 * to be preeempted as soon as we drop the space
710 * lock below.
711 *
712 * To avoid this disable preemption and let kevent
713 * reenable it after it takes the kqlock.
714 */
715 disable_preemption();
716 result |= FILTER_THREADREQ_NODEFEER;
717 }
718
719 io_unlock(object);
720
721 if (send_turnstile) {
722 turnstile_update_inheritor_complete(send_turnstile,
723 TURNSTILE_INTERLOCK_NOT_HELD);
724 turnstile_deallocate_safe(send_turnstile);
725 }
726 } else {
727 io_unlock(object);
728 error = ENOTSUP;
729 }
730
731 out:
732 /* bail out on errors */
733 if (error) {
734 knote_set_error(kn, error);
735 return 0;
736 }
737
738 return result;
739 }
740
741 static void
filt_machportdetach(struct knote * kn)742 filt_machportdetach(
743 struct knote *kn)
744 {
745 ipc_object_t object = kn->kn_ipc_obj;
746 struct turnstile *send_turnstile = TURNSTILE_NULL;
747
748 filt_machport_turnstile_complete(kn);
749
750 io_lock(object);
751 if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
752 /*
753 * ipc_mqueue_changed() already unhooked this knote from the waitq,
754 */
755 } else {
756 ipc_port_t port = IP_NULL;
757
758 /*
759 * When the knote being detached is the first one in the list,
760 * then unlinking the knote *and* updating the turnstile inheritor
761 * need to happen atomically with respect to the callers of
762 * turnstile_workloop_pusher_info().
763 *
764 * The caller of turnstile_workloop_pusher_info() will use the kq req
765 * lock (and hence the kqlock), so we just need to hold the kqlock too.
766 */
767 if (io_otype(object) == IOT_PORT) {
768 port = ip_object_to_port(object);
769 assert(port->ip_sync_link_state == PORT_SYNC_LINK_ANY);
770 if (kn == SLIST_FIRST(&port->ip_klist)) {
771 send_turnstile = port_send_turnstile(port);
772 }
773 filt_machport_unlink(&port->ip_klist, kn);
774 struct kqueue *kq = knote_get_kq(kn);
775 kqueue_set_iotier_override(kq, THROTTLE_LEVEL_END);
776 } else {
777 ipc_pset_t pset = ips_object_to_pset(object);
778
779 filt_machport_unlink(&pset->ips_klist, kn);
780 }
781
782
783 if (send_turnstile) {
784 turnstile_reference(send_turnstile);
785 ipc_port_send_update_inheritor(port, send_turnstile,
786 TURNSTILE_IMMEDIATE_UPDATE);
787 }
788 }
789
790 /* Clear the knote pointer once the knote has been removed from turnstile */
791 kn->kn_ipc_obj = IO_NULL;
792 io_unlock(object);
793
794 if (send_turnstile) {
795 turnstile_update_inheritor_complete(send_turnstile,
796 TURNSTILE_INTERLOCK_NOT_HELD);
797 turnstile_deallocate(send_turnstile);
798 }
799
800 io_release(object);
801 }
802
803 /*
804 * filt_machportevent - deliver events into the mach port filter
805 *
806 * Mach port message arrival events are currently only posted via the
807 * kqueue filter routine for ports.
808 *
809 * If there is a message at the head of the queue,
810 * we indicate that the knote should go active. If
 * the message is to be direct-received, we adjust the
 * QoS of the knote according to the requested and override
 * QoS of that first message.
814 *
815 * When the knote is for a port-set, the hint is non 0
816 * and is the waitq which is posting.
817 */
818 static int
filt_machportevent(struct knote * kn,long hint __assert_only)819 filt_machportevent(struct knote *kn, long hint __assert_only)
820 {
821 if (io_otype(kn->kn_ipc_obj) == IOT_PORT_SET) {
822 /*
823 * When called for a port-set,
824 * the posting port waitq is locked.
825 *
826 * waitq_set_first_prepost()
827 * in filt_machport_filter_result()
828 * would try to lock it and be very sad.
829 *
830 * Just trust what we know to be true.
831 */
832 assert(hint != 0);
833 return FILTER_ACTIVE;
834 }
835 assert(hint == 0);
836 return filt_machport_filter_result(kn, kn->kn_ipc_obj);
837 }
838
839 void
ipc_pset_prepost(struct waitq_set * wqs,struct waitq * waitq)840 ipc_pset_prepost(struct waitq_set *wqs, struct waitq *waitq)
841 {
842 KNOTE(&ips_from_waitq(wqs)->ips_klist, (long)waitq);
843 }
844
845 static int
filt_machporttouch(struct knote * kn,struct kevent_qos_s * kev)846 filt_machporttouch(
847 struct knote *kn,
848 struct kevent_qos_s *kev)
849 {
850 ipc_object_t object = kn->kn_ipc_obj;
851 int result = 0;
852
853 /*
854 * Specificying MACH_RCV_MSG or MACH_RCV_SYNC_PEEK during attach results in
855 * allocation of a turnstile. Modifying the filter flags to include these
856 * flags later, without a turnstile being allocated, leads to
857 * inconsistencies.
858 */
859 if ((kn->kn_sfflags ^ kev->fflags) & (MACH_RCV_MSG | MACH_RCV_SYNC_PEEK)) {
860 kev->flags |= EV_ERROR;
861 kev->data = EINVAL;
862 return 0;
863 }
864
865 /* copy in new settings and save off new input fflags */
866 kn->kn_sfflags = kev->fflags;
867 kn->kn_ext[0] = kev->ext[0];
868 kn->kn_ext[1] = kev->ext[1];
869
870 if (kev->flags & EV_ENABLE) {
871 /*
872 * If the knote is being enabled, make sure there's no lingering
873 * IPC overrides from the previous message delivery.
874 */
875 filt_machport_turnstile_complete(kn);
876 }
877
878 io_lock(object);
879 result = filt_machport_filter_result(kn, object);
880 io_unlock(object);
881
882 return result;
883 }
884
885 static int
filt_machportprocess(struct knote * kn,struct kevent_qos_s * kev)886 filt_machportprocess(struct knote *kn, struct kevent_qos_s *kev)
887 {
888 ipc_object_t object = kn->kn_ipc_obj;
889 thread_t self = current_thread();
890 kevent_ctx_t kectx = NULL;
891
892 wait_result_t wresult;
893 mach_msg_option64_t option64;
894 mach_vm_address_t msg_addr;
895 mach_msg_size_t max_msg_size, cpout_aux_size, cpout_msg_size;
896 uint32_t ppri;
897 mach_msg_qos_t oqos;
898
899 int result = FILTER_ACTIVE;
900
901 /* Capture current state */
902 knote_fill_kevent(kn, kev, MACH_PORT_NULL);
903
904 /* Clear port reference, use ext3 as size of msg aux data */
905 kev->ext[3] = 0;
906
907 /* If already deallocated/moved return one last EOF event */
908 if (kev->flags & EV_EOF) {
909 return FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
910 }
911
912 /*
913 * Only honor supported receive options. If no options are
914 * provided, just force a MACH_RCV_LARGE to detect the
915 * name of the port and sizeof the waiting message.
916 *
917 * Extend kn_sfflags to 64 bits.
918 */
919 option64 = (mach_msg_option64_t)kn->kn_sfflags & (MACH_RCV_MSG |
920 MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
921 MACH_RCV_TRAILER_MASK | MACH_RCV_VOUCHER | MACH_MSG_STRICT_REPLY);
922
923 if (option64 & MACH_RCV_MSG) {
924 msg_addr = (mach_vm_address_t) kn->kn_ext[0];
925 max_msg_size = (mach_msg_size_t) kn->kn_ext[1];
926
927 /*
928 * Copy out the incoming message as vector, and append aux data
929 * immediately after the message proper (if any) and report its
930 * size on ext3.
931 */
932 option64 |= (MACH64_MSG_VECTOR | MACH64_RCV_LINEAR_VECTOR);
933
934 /*
935 * If the kevent didn't specify a buffer and length, carve a buffer
936 * from the filter processing data according to the flags.
937 */
938 if (max_msg_size == 0) {
939 kectx = kevent_get_context(self);
940 msg_addr = (mach_vm_address_t)kectx->kec_data_out;
941 max_msg_size = (mach_msg_size_t)kectx->kec_data_resid;
942 option64 |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY);
943 /* Receive vector linearly onto stack */
944 if (kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) {
945 option64 |= MACH64_RCV_STACK;
946 }
947 }
948 } else {
949 /* just detect the port name (if a set) and size of the first message */
950 option64 = MACH_RCV_LARGE;
951 msg_addr = 0;
952 max_msg_size = 0;
953 }
954
955 /*
956 * Set up to receive a message or the notification of a
957 * too large message. But never allow this call to wait.
958 * If the user provided aditional options, like trailer
959 * options, pass those through here. But we don't support
960 * scatter lists through this interface.
961 *
962 * Note: while in filt_machportprocess(),
963 * the knote has a reference on `object` that we can borrow.
964 */
965 self->ith_object = object;
966
967 /* Using msg_addr as combined buffer for message proper and aux */
968 self->ith_msg_addr = msg_addr;
969 self->ith_max_msize = max_msg_size;
970 self->ith_msize = 0;
971
972 self->ith_aux_addr = 0;
973 self->ith_max_asize = 0;
974 self->ith_asize = 0;
975
976 self->ith_option = option64;
977 self->ith_receiver_name = MACH_PORT_NULL;
978 option64 |= MACH_RCV_TIMEOUT; // never wait
979 self->ith_state = MACH_RCV_IN_PROGRESS;
980 self->ith_knote = kn;
981
982 io_lock(object);
983
984 wresult = ipc_mqueue_receive_on_thread_and_unlock(
985 io_waitq(object),
986 option64,
987 self->ith_max_msize, /* max msg suze */
988 0, /* max aux size 0, using combined buffer */
989 0, /* immediate timeout */
990 THREAD_INTERRUPTIBLE,
991 self);
992 /* port unlocked */
993
994 /* If we timed out, or the process is exiting, just zero. */
995 if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) {
996 assert(self->turnstile != TURNSTILE_NULL);
997 self->ith_knote = ITH_KNOTE_NULL;
998 return 0;
999 }
1000
1001 assert(wresult == THREAD_NOT_WAITING);
1002 assert(self->ith_state != MACH_RCV_IN_PROGRESS);
1003
1004 /*
1005 * If we weren't attempting to receive a message
1006 * directly, we need to return the port name in
1007 * the kevent structure.
1008 */
1009 if ((option64 & MACH_RCV_MSG) != MACH_RCV_MSG) {
1010 assert(self->ith_state == MACH_RCV_TOO_LARGE);
1011 assert(self->ith_kmsg == IKM_NULL);
1012 kev->data = self->ith_receiver_name;
1013 self->ith_knote = ITH_KNOTE_NULL;
1014 return result;
1015 }
1016
1017 #if CONFIG_PREADOPT_TG
1018 /* If we're the first EVFILT_MACHPORT knote that is being processed for this
1019 * kqwl, then make sure to preadopt the thread group from the kmsg we're
1020 * about to receive. This is to make sure that we fix up the preadoption
1021 * thread group correctly on the receive side for the first message.
1022 */
1023 struct kqueue *kq = knote_get_kq(kn);
1024
1025 if (self->ith_kmsg) {
1026 struct thread_group *tg = ipc_kmsg_get_thread_group(self->ith_kmsg);
1027
1028 kqueue_process_preadopt_thread_group(self, kq, tg);
1029 }
1030 #endif
1031 ipc_port_t port = ip_object_to_port(object);
1032 struct kqueue *kqwl = knote_get_kq(kn);
1033 if (port->ip_kernel_iotier_override != kqueue_get_iotier_override(kqwl)) {
1034 /*
1035 * Lock the port to make sure port->ip_kernel_iotier_override does
1036 * not change while updating the kqueue override, else kqueue could
1037 * have old iotier value.
1038 */
1039 ip_mq_lock(port);
1040 kqueue_set_iotier_override(kqwl, port->ip_kernel_iotier_override);
1041 result |= FILTER_ADJUST_EVENT_IOTIER_BIT;
1042 ip_mq_unlock(port);
1043 }
1044
1045 /*
1046 * Attempt to receive the message directly, returning
1047 * the results in the fflags field.
1048 */
1049 io_reference(object);
1050 kev->fflags = mach_msg_receive_results_kevent(&cpout_msg_size,
1051 &cpout_aux_size, &ppri, &oqos);
1052
1053 /* kmsg and object reference consumed */
1054
1055 /*
1056 * if the user asked for the identity of ports containing a
1057 * a too-large message, return it in the data field (as we
1058 * do for messages we didn't try to receive).
1059 */
1060 if (kev->fflags == MACH_RCV_TOO_LARGE) {
1061 kev->ext[1] = self->ith_msize;
1062 kev->ext[3] = self->ith_asize; /* Only lower 32 bits of ext3 are used */
1063 if (option64 & MACH_RCV_LARGE_IDENTITY) {
1064 kev->data = self->ith_receiver_name;
1065 } else {
1066 kev->data = MACH_PORT_NULL;
1067 }
1068 } else {
1069 kev->ext[1] = cpout_msg_size;
1070 kev->ext[3] = cpout_aux_size; /* Only lower 32 bits of ext3 are used */
1071 kev->data = MACH_PORT_NULL;
1072 }
1073
1074 /*
1075 * If we used a data buffer carved out from the filt_process data,
1076 * store the address used in the knote and adjust the residual and
1077 * other parameters for future use.
1078 */
1079 if (kectx) {
1080 assert(kectx->kec_data_resid >= cpout_msg_size + cpout_aux_size);
1081 kectx->kec_data_resid -= cpout_msg_size + cpout_aux_size;
1082 if ((kectx->kec_process_flags & KEVENT_FLAG_STACK_DATA) == 0) {
1083 kev->ext[0] = kectx->kec_data_out;
1084 kectx->kec_data_out += cpout_msg_size + cpout_aux_size;
1085 } else {
1086 assert(option64 & MACH64_RCV_STACK);
1087 kev->ext[0] = kectx->kec_data_out + kectx->kec_data_resid;
1088 }
1089 }
1090
1091 /*
1092 * Apply message-based QoS values to output kevent as prescribed.
1093 * The kev->ext[2] field gets (msg-qos << 32) | (override-qos).
1094 */
1095 if (kev->fflags == MACH_MSG_SUCCESS) {
1096 kev->ext[2] = ((uint64_t)ppri << 32) |
1097 _pthread_priority_make_from_thread_qos(oqos, 0, 0);
1098 }
1099
1100 self->ith_knote = ITH_KNOTE_NULL;
1101 return result;
1102 }
1103
1104 SECURITY_READ_ONLY_EARLY(struct filterops) machport_filtops = {
1105 .f_adjusts_qos = true,
1106 .f_extended_codes = true,
1107 .f_attach = filt_machportattach,
1108 .f_detach = filt_machportdetach,
1109 .f_event = filt_machportevent,
1110 .f_touch = filt_machporttouch,
1111 .f_process = filt_machportprocess,
1112 };
1113