xref: /xnu-12377.41.6/bsd/kern/sys_ulock.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828) !
1 /*
2  * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <machine/atomic.h>
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/ioctl.h>
34 #include <sys/file_internal.h>
35 #include <sys/proc_internal.h>
36 #include <sys/kernel.h>
37 #include <sys/guarded.h>
38 #include <sys/stat.h>
39 #include <sys/malloc.h>
40 #include <sys/sysproto.h>
41 #include <sys/pthread_shims.h>
42 
43 #include <mach/mach_types.h>
44 
45 #include <kern/cpu_data.h>
46 #include <kern/mach_param.h>
47 #include <kern/kern_types.h>
48 #include <kern/assert.h>
49 #include <kern/zalloc.h>
50 #include <kern/thread.h>
51 #include <kern/clock.h>
52 #include <kern/ledger.h>
53 #include <kern/policy_internal.h>
54 #include <kern/task.h>
55 #include <kern/telemetry.h>
56 #include <kern/waitq.h>
57 #include <kern/sched_prim.h>
58 #include <kern/turnstile.h>
59 #include <kern/zalloc.h>
60 #include <kern/debug.h>
61 
62 #include <vm/vm_map_xnu.h>
63 
64 #include <pexpert/pexpert.h>
65 
66 #define XNU_TEST_BITMAP
67 #include <kern/bits.h>
68 
69 #include <os/hash.h>
70 #include <sys/ulock.h>
71 
72 /*
73  * How ulock promotion works:
74  *
75  * There’s a requested policy field on every thread called ‘promotions’, which
76  * expresses which ulock promotions are happening to this thread.
77  * The promotion priority saturates until the promotion count goes to 0.
78  *
79  * We also track effective promotion qos, which is the qos before clamping.
80  * This value is used for promoting a thread that another thread is waiting on,
81  * so that the lock owner reinflates to the right priority after unclamping.
82  *
83  * This also works for non-QoS threads, which can donate base priority to QoS
84  * and non-QoS threads alike.
85  *
86  * ulock wait applies a promotion to the owner communicated through
87  * UL_UNFAIR_LOCK as waiters block, and that promotion is saturated as long as
88  * there is still an owner.  In ulock wake, if the waker is still the owner,
89  * then it clears its ownership and drops the boost.  It does NOT transfer
90  * ownership/priority boost to the new thread.  Instead, it selects the
91  * waiting thread with the highest base priority to be woken next, and
92  * relies on that thread to carry the torch for the other waiting threads.
93  */
94 
/* Lock group shared by all ulock-related locks, for lock instrumentation */
static LCK_GRP_DECLARE(ull_lck_grp, "ulocks");

#if XNU_TARGET_OS_XR
/* XR targets use ticket locks for the per-ulock interlock */
#define ULL_TICKET_LOCK 1
#endif /* XNU_TARGET_OS_XR */

/*
 * Per-ulock interlock abstraction: a ticket lock when ULL_TICKET_LOCK is
 * defined, a spinlock otherwise.  Both variants share ull_lck_grp.
 */
#if ULL_TICKET_LOCK
typedef lck_ticket_t ull_lock_t;
#define ull_lock_init(ull)      lck_ticket_init(&ull->ull_lock, &ull_lck_grp)
#define ull_lock_destroy(ull)   lck_ticket_destroy(&ull->ull_lock, &ull_lck_grp)
#define ull_lock(ull)           lck_ticket_lock(&ull->ull_lock, &ull_lck_grp)
#define ull_unlock(ull)         lck_ticket_unlock(&ull->ull_lock)
#define ull_assert_owned(ull)   lck_ticket_assert_owned(&ull->ull_lock)
#define ull_assert_notwned(ull) lck_ticket_assert_not_owned(&ull->ull_lock)
#else
typedef lck_spin_t ull_lock_t;
#define ull_lock_init(ull)      lck_spin_init(&ull->ull_lock, &ull_lck_grp, NULL)
#define ull_lock_destroy(ull)   lck_spin_destroy(&ull->ull_lock, &ull_lck_grp)
#define ull_lock(ull)           lck_spin_lock_grp(&ull->ull_lock, &ull_lck_grp)
#define ull_unlock(ull)         lck_spin_unlock(&ull->ull_lock)
#define ull_assert_owned(ull)   LCK_SPIN_ASSERT(&ull->ull_lock, LCK_ASSERT_OWNED)
#define ull_assert_notwned(ull) LCK_SPIN_ASSERT(&ull->ull_lock, LCK_ASSERT_NOTOWNED)
#endif /* ULL_TICKET_LOCK */
118 
/* A ull_t pointer doubles as the wait event for its waitq */
#define ULOCK_TO_EVENT(ull)   ((event_t)ull)
#define EVENT_TO_ULOCK(event) ((ull_t *)event)

/* Discriminates which arm of the ulk_t union is valid */
typedef enum {
	ULK_INVALID = 0,        /* key cleared; ull is on its way to being freed */
	ULK_UADDR,              /* per-process key: (task, user address) */
	ULK_XPROC,              /* cross-process key: (VM object id, offset) */
} ulk_type;

/*
 * Hash key identifying a ulock.  Only the bytes of the active union arm
 * are hashed (see ULK_UADDR_LEN / ULK_XPROC_LEN); ulk_key_type itself is
 * not part of the hashed bytes but is compared in ull_key_match().
 */
typedef struct {
	union {
		struct __attribute__((packed)) {
			user_addr_t     ulk_addr;
			/*
			 * We use the task address as a hashing key,
			 * so that ulock wakes across exec can't
			 * be confused.
			 */
			task_t          ulk_task __kernel_data_semantics;
		};
		struct __attribute__((packed)) {
			uint64_t        ulk_object;
			uint64_t        ulk_offset;
		};
	};
	ulk_type        ulk_key_type;
} ulk_t;

/* Number of key bytes fed to the hash, per key type */
#define ULK_UADDR_LEN   (sizeof(user_addr_t) + sizeof(task_t))
#define ULK_XPROC_LEN   (sizeof(uint64_t) + sizeof(uint64_t))
149 
150 inline static bool
ull_key_match(ulk_t * a,ulk_t * b)151 ull_key_match(ulk_t *a, ulk_t *b)
152 {
153 	if (a->ulk_key_type != b->ulk_key_type) {
154 		return false;
155 	}
156 
157 	if (a->ulk_key_type == ULK_UADDR) {
158 		return (a->ulk_task == b->ulk_task) &&
159 		       (a->ulk_addr == b->ulk_addr);
160 	}
161 
162 	assert(a->ulk_key_type == ULK_XPROC);
163 	return (a->ulk_object == b->ulk_object) &&
164 	       (a->ulk_offset == b->ulk_offset);
165 }
166 
/* Kernel-side state for one contended ulock, chained on a hash bucket */
typedef struct ull {
	/*
	 * ull_owner is the most recent known value for the owner of this ulock
	 * i.e. it may be out of date WRT the real value in userspace.
	 */
	thread_t        ull_owner; /* holds +1 thread reference */
	ulk_t           ull_key;          /* hash key (see ulk_t) */
	ull_lock_t      ull_lock;         /* interlock protecting this struct */
	uint            ull_bucket_index; /* bucket this ull is hashed on */
	int32_t         ull_nwaiters;     /* waiters currently in the kernel */
	int32_t         ull_refcount;     /* lookups holding a reference */
	uint8_t         ull_opcode;       /* UL_* opcode this ull is in use for */
	struct turnstile *ull_turnstile;  /* priority-inheritance turnstile */
	queue_chain_t   ull_hash_link;    /* linkage on the bucket queue */
} ull_t;

/* ull_get() flag: return NULL instead of allocating when no ull exists */
#define ULL_MUST_EXIST  0x0001
static void ull_put(ull_t *);

/* Max time to spin in ULF_WAIT_ADAPTIVE_SPIN before blocking (sysctl tunable) */
static uint32_t ulock_adaptive_spin_usecs = 20;

SYSCTL_INT(_kern, OID_AUTO, ulock_adaptive_spin_usecs, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ulock_adaptive_spin_usecs, 0, "ulock adaptive spin duration");
190 
#if DEVELOPMENT || DEBUG
/*
 * When set to a pid (or 1 for every process), sys_ulock_wait2 randomly
 * injects copyin faults; configured via UL_DEBUG_SIMULATE_COPYIN_FAULT.
 */
static int ull_simulate_copyin_fault = 0;

/* Print one ull_t to the kernel log (debug builds only) */
static void
ull_dump(ull_t *ull)
{
	kprintf("ull\t%p\n", ull);
	switch (ull->ull_key.ulk_key_type) {
	case ULK_UADDR:
		kprintf("ull_key.ulk_key_type\tULK_UADDR\n");
		kprintf("ull_key.ulk_task\t%p\n", ull->ull_key.ulk_task);
		kprintf("ull_key.ulk_addr\t%p\n", (void *)(ull->ull_key.ulk_addr));
		break;
	case ULK_XPROC:
		kprintf("ull_key.ulk_key_type\tULK_XPROC\n");
		kprintf("ull_key.ulk_object\t%p\n", (void *)(ull->ull_key.ulk_object));
		kprintf("ull_key.ulk_offset\t%p\n", (void *)(ull->ull_key.ulk_offset));
		break;
	default:
		kprintf("ull_key.ulk_key_type\tUNKNOWN %d\n", ull->ull_key.ulk_key_type);
		break;
	}
	kprintf("ull_nwaiters\t%d\n", ull->ull_nwaiters);
	kprintf("ull_refcount\t%d\n", ull->ull_refcount);
	kprintf("ull_opcode\t%d\n\n", ull->ull_opcode);
	kprintf("ull_owner\t0x%llx\n\n", thread_tid(ull->ull_owner));
	kprintf("ull_turnstile\t%p\n\n", ull->ull_turnstile);
}
#endif
220 
/* One hash bucket: a queue of ull_t plus the lock protecting it */
typedef struct ull_bucket {
	queue_head_t ulb_head;
#if ULL_TICKET_LOCK
	lck_ticket_t ulb_lock;
#else
	lck_spin_t   ulb_lock;
#endif /* ULL_TICKET_LOCK */
} ull_bucket_t;

/* Sized and allocated once at boot by ulock_initialize(); power of two */
static SECURITY_READ_ONLY_LATE(int) ull_hash_buckets;
static SECURITY_READ_ONLY_LATE(ull_bucket_t *) ull_bucket;
/* Running count of ull_t allocations (debug aid; reset by ull_hash_dump) */
static uint32_t ull_nzalloc = 0;
static KALLOC_TYPE_DEFINE(ull_zone, ull_t, KT_DEFAULT);

#if ULL_TICKET_LOCK
#define ull_bucket_lock(i)       lck_ticket_lock(&ull_bucket[i].ulb_lock, &ull_lck_grp)
#define ull_bucket_unlock(i)     lck_ticket_unlock(&ull_bucket[i].ulb_lock)
#else
#define ull_bucket_lock(i)       lck_spin_lock_grp(&ull_bucket[i].ulb_lock, &ull_lck_grp)
#define ull_bucket_unlock(i)     lck_spin_unlock(&ull_bucket[i].ulb_lock)
#endif /* ULL_TICKET_LOCK */
242 static __inline__ uint32_t
ull_hash_index(const void * key,size_t length)243 ull_hash_index(const void *key, size_t length)
244 {
245 	uint32_t hash = os_hash_jenkins(key, length);
246 
247 	hash &= (ull_hash_buckets - 1);
248 
249 	return hash;
250 }
251 
252 #define ULL_INDEX(keyp) ull_hash_index(keyp, keyp->ulk_key_type == ULK_UADDR ? ULK_UADDR_LEN : ULK_XPROC_LEN)
253 
/*
 * Boot-time setup of the global ulock hash table: sizes the bucket array
 * from thread_max, allocates it permanently, and initializes each
 * bucket's queue and lock.  Runs at EARLY_BOOT via STARTUP below.
 */
static void
ulock_initialize(void)
{
	assert(thread_max > 16);
	/* Size ull_hash_buckets based on thread_max.
	 * Round up to nearest power of 2, then divide by 4
	 */
	ull_hash_buckets = (1 << (bit_ceiling(thread_max) - 2));

	kprintf("%s>thread_max=%d, ull_hash_buckets=%d\n", __FUNCTION__, thread_max, ull_hash_buckets);
	assert(ull_hash_buckets >= thread_max / 4);

	/* permanent allocation: the table lives for the lifetime of the boot */
	ull_bucket = zalloc_permanent(sizeof(ull_bucket_t) * ull_hash_buckets,
	    ZALIGN_PTR);
	assert(ull_bucket != NULL);

	for (int i = 0; i < ull_hash_buckets; i++) {
		queue_init(&ull_bucket[i].ulb_head);
#if ULL_TICKET_LOCK
		lck_ticket_init(&ull_bucket[i].ulb_lock, &ull_lck_grp);
#else
		lck_spin_init(&ull_bucket[i].ulb_lock, &ull_lck_grp, NULL);
#endif /* ULL_TICKET_LOCK */
	}
}
STARTUP(EARLY_BOOT, STARTUP_RANK_FIRST, ulock_initialize);
280 
#if DEVELOPMENT || DEBUG
/* Count the number of hash entries for a given task address.
 * if task==0, dump the whole table.
 * Returns the number of matching entries dumped via ull_dump().
 */
static int
ull_hash_dump(task_t task)
{
	int count = 0;
	if (task == TASK_NULL) {
		kprintf("%s>total number of ull_t allocated %d\n", __FUNCTION__, ull_nzalloc);
		kprintf("%s>BEGIN\n", __FUNCTION__);
	}
	for (int i = 0; i < ull_hash_buckets; i++) {
		/* hold the bucket lock while walking its chain */
		ull_bucket_lock(i);
		if (!queue_empty(&ull_bucket[i].ulb_head)) {
			ull_t *elem;
			if (task == TASK_NULL) {
				kprintf("%s>index %d:\n", __FUNCTION__, i);
			}
			qe_foreach_element(elem, &ull_bucket[i].ulb_head, ull_hash_link) {
				/* match everything, or only this task's ULK_UADDR locks */
				if ((task == TASK_NULL) || ((elem->ull_key.ulk_key_type == ULK_UADDR)
				    && (task == elem->ull_key.ulk_task))) {
					ull_dump(elem);
					count++;
				}
			}
		}
		ull_bucket_unlock(i);
	}
	if (task == TASK_NULL) {
		kprintf("%s>END\n", __FUNCTION__);
		/* a full dump resets the allocation statistic */
		ull_nzalloc = 0;
	}
	return count;
}
#endif
317 
/*
 * Allocate and initialize a new ull_t for 'key'.
 *
 * The returned ull starts with a refcount of 1, no owner, no waiters and
 * no turnstile.  It is NOT yet inserted into the hash table (ull_get()
 * does that) and is returned unlocked.
 */
static ull_t *
ull_alloc(ulk_t *key)
{
	ull_t *ull = (ull_t *)zalloc_flags(ull_zone, Z_SET_NOTEARLY);
	assert(ull != NULL);

	ull->ull_refcount = 1;
	ull->ull_key = *key;
	ull->ull_bucket_index = ULL_INDEX(key);
	ull->ull_nwaiters = 0;
	ull->ull_opcode = 0;

	ull->ull_owner = THREAD_NULL;
	ull->ull_turnstile = TURNSTILE_NULL;

	ull_lock_init(ull);

	/* debug statistic; incremented without synchronization */
	ull_nzalloc++;
	return ull;
}
338 
/*
 * Destroy a ull_t.  The owner must already be cleared and the turnstile
 * returned.  Called either on the unused pre-allocation from ull_get(),
 * or from ull_put() when the last reference is dropped.
 */
static void
ull_free(ull_t *ull)
{
	assert(ull->ull_owner == THREAD_NULL);
	assert(ull->ull_turnstile == TURNSTILE_NULL);

	ull_assert_notwned(ull);

	ull_lock_destroy(ull);

	zfree(ull_zone, ull);
}
351 
/* Finds an existing ulock structure (ull_t), or creates a new one.
 * If MUST_EXIST flag is set, returns NULL instead of creating a new one.
 * The ulock structure is returned with ull_lock locked
 *
 * When an existing ull is found and a new one was pre-allocated, the
 * unused allocation is handed back through *unused_ull for the caller
 * to free once it has dropped all locks.
 */
static ull_t *
ull_get(ulk_t *key, uint32_t flags, ull_t **unused_ull)
{
	ull_t *ull = NULL;
	uint i = ULL_INDEX(key);
	/* pre-allocate outside the bucket lock; may go unused if the key exists */
	ull_t *new_ull = (flags & ULL_MUST_EXIST) ? NULL : ull_alloc(key);
	ull_t *elem;

	ull_bucket_lock(i);
	qe_foreach_element(elem, &ull_bucket[i].ulb_head, ull_hash_link) {
		/* lock elem before the key check so a match is returned locked */
		ull_lock(elem);
		if (ull_key_match(&elem->ull_key, key)) {
			ull = elem;
			break;
		} else {
			ull_unlock(elem);
		}
	}
	if (ull == NULL) {
		if (flags & ULL_MUST_EXIST) {
			/* Must already exist (called from wake) */
			ull_bucket_unlock(i);
			assert(new_ull == NULL);
			assert(unused_ull == NULL);
			return NULL;
		}

		if (new_ull == NULL) {
			/* Alloc above failed */
			ull_bucket_unlock(i);
			return NULL;
		}

		ull = new_ull;
		ull_lock(ull);
		enqueue(&ull_bucket[i].ulb_head, &ull->ull_hash_link);
	} else if (!(flags & ULL_MUST_EXIST)) {
		/* found an existing ull; return the unused pre-allocation */
		assert(new_ull);
		assert(unused_ull);
		assert(*unused_ull == NULL);
		*unused_ull = new_ull;
	}

	ull->ull_refcount++;

	ull_bucket_unlock(i);

	return ull; /* still locked */
}
405 
/*
 * Must be called with ull_lock held
 *
 * Drops one reference and always drops ull_lock.  When the last
 * reference goes away, the ull is unhashed and freed; that is only
 * legal once the caller has invalidated the key (ULK_INVALID), which
 * ulock_wait_cleanup() does when the last waiter leaves.
 */
static void
ull_put(ull_t *ull)
{
	ull_assert_owned(ull);
	int refcount = --ull->ull_refcount;
	/* reaching zero requires the key to have been cleared first */
	assert(refcount == 0 ? (ull->ull_key.ulk_key_type == ULK_INVALID) : 1);
	ull_unlock(ull);

	if (refcount > 0) {
		return;
	}

	ull_bucket_lock(ull->ull_bucket_index);
	remqueue(&ull->ull_hash_link);
	ull_bucket_unlock(ull->ull_bucket_index);

	ull_free(ull);
}
427 
428 
429 extern boolean_t machine_thread_on_core(thread_t thread);
430 
431 static int
uaddr_findobj(user_addr_t uaddr,uint64_t * objectp,uint64_t * offsetp)432 uaddr_findobj(user_addr_t uaddr, uint64_t *objectp, uint64_t *offsetp)
433 {
434 	kern_return_t ret;
435 	vm_page_info_basic_data_t info;
436 	mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT;
437 
438 #if HAS_MTE || HAS_MTE_EMULATION_SHIMS
439 	/*
440 	 * uaddr_findobj() is the common entrypoint for sys_ulock*
441 	 * syscalls. We allow tagged addresses through and strip
442 	 * away metadata bits here.
443 	 */
444 	uaddr = vm_map_strip_addr(current_map(), uaddr);
445 #endif /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
446 
447 	ret = vm_map_page_info(current_map(), uaddr, VM_PAGE_INFO_BASIC, (vm_page_info_t)&info, &count);
448 	if (ret != KERN_SUCCESS) {
449 		return EINVAL;
450 	}
451 
452 	if (objectp != NULL) {
453 		*objectp = (uint64_t)info.object_id;
454 	}
455 	if (offsetp != NULL) {
456 		*offsetp = (uint64_t)info.offset;
457 	}
458 
459 	return 0;
460 }
461 
462 static void ulock_wait_continue(void *, wait_result_t);
463 static void ulock_wait_cleanup(ull_t *, thread_t, thread_t, int32_t *);
464 
465 inline static int
wait_result_to_return_code(wait_result_t wr)466 wait_result_to_return_code(wait_result_t wr)
467 {
468 	int ret = 0;
469 
470 	switch (wr) {
471 	case THREAD_AWAKENED:
472 		break;
473 	case THREAD_TIMED_OUT:
474 		ret = ETIMEDOUT;
475 		break;
476 	case THREAD_INTERRUPTED:
477 	case THREAD_RESTART:
478 	default:
479 		ret = EINTR;
480 		break;
481 	}
482 
483 	return ret;
484 }
485 
/*
 * Translate the thread port name embedded in an unfair-lock value into
 * a thread_t holding a +1 reference (returned through *owner).
 *
 * Returns 0 on success.  On failure *owner is THREAD_NULL and the
 * result is ESRCH when the name is MACH_PORT_DEAD (owner exited),
 * EOWNERDEAD otherwise.
 */
static int
ulock_resolve_owner(uint32_t value, thread_t *owner)
{
	mach_port_name_t owner_name = ulock_owner_value_to_port_name(value);

	/* only accept threads of the current task, excluding the caller */
	*owner = port_name_to_thread(owner_name,
	    PORT_INTRANS_THREAD_IN_CURRENT_TASK |
	    PORT_INTRANS_THREAD_NOT_CURRENT_THREAD);
	if (*owner == THREAD_NULL) {
		/*
		 * Translation failed - even though the lock value is up to date,
		 * whatever was stored in the lock wasn't actually a thread port.
		 */
		return owner_name == MACH_PORT_DEAD ? ESRCH : EOWNERDEAD;
	}
	return 0;
}
503 
504 int
sys_ulock_wait(struct proc * p,struct ulock_wait_args * args,int32_t * retval)505 sys_ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
506 {
507 	struct ulock_wait2_args args2;
508 
509 	args2.operation = args->operation;
510 	args2.addr      = args->addr;
511 	args2.value     = args->value;
512 	args2.timeout   = (uint64_t)(args->timeout) * NSEC_PER_USEC;
513 	args2.value2    = 0;
514 
515 	return sys_ulock_wait2(p, &args2, retval);
516 }
517 
/*
 * Block the calling thread until the ulock at args->addr is woken,
 * provided the userspace value still equals args->value at the time the
 * kernel rechecks it under the ull lock.
 *
 * args->operation carries a UL_* opcode plus ULF_WAIT_* flags.
 * args->timeout is relative nanoseconds, or an absolute deadline when
 * ULF_DEADLINE is set; 0 means wait forever.
 *
 * For UL_UNFAIR_LOCK the lock value names the owning thread, which is
 * boosted through a turnstile while waiters block.
 *
 * On success *retval is the remaining waiter count (>= 0).  With
 * ULF_NO_ERRNO, errors are reported as -errno in *retval instead.
 */
int
sys_ulock_wait2(struct proc *p, struct ulock_wait2_args *args, int32_t *retval)
{
	uint8_t opcode = (uint8_t)(args->operation & UL_OPCODE_MASK);
	uint flags = args->operation & UL_FLAGS_MASK;

	if (flags & ULF_WAIT_CANCEL_POINT) {
		__pthread_testcancel(1);
	}

	int ret = 0;
	thread_t self = current_thread();
	ulk_t key;

	/* involved threads - each variable holds +1 ref if not null */
	thread_t owner_thread   = THREAD_NULL;
	thread_t old_owner      = THREAD_NULL;

	ull_t *unused_ull = NULL;

	if ((flags & ULF_WAIT_MASK) != flags) {
		ret = EINVAL;
		goto munge_retval;
	}

	bool set_owner = false;
	bool xproc = false;
	size_t lock_size = sizeof(uint32_t);
	int copy_ret;

	switch (opcode) {
	case UL_UNFAIR_LOCK:
		/* unfair-lock values embed the owner's thread port for PI */
		set_owner = true;
		break;
	case UL_COMPARE_AND_WAIT:
		break;
	case UL_COMPARE_AND_WAIT64:
		lock_size = sizeof(uint64_t);
		break;
	case UL_COMPARE_AND_WAIT_SHARED:
		xproc = true;
		break;
	case UL_COMPARE_AND_WAIT64_SHARED:
		xproc = true;
		lock_size = sizeof(uint64_t);
		break;
	default:
		ret = EINVAL;
		goto munge_retval;
	}

	uint64_t value = 0;

	/* the address must be non-zero and naturally aligned for the lock size */
	if ((args->addr == 0) || (args->addr & (lock_size - 1))) {
		ret = EINVAL;
		goto munge_retval;
	}

	if (xproc) {
		uint64_t object = 0;
		uint64_t offset = 0;

		ret = uaddr_findobj(args->addr, &object, &offset);
		if (ret) {
			ret = EINVAL;
			goto munge_retval;
		}
		key.ulk_key_type = ULK_XPROC;
		key.ulk_object = object;
		key.ulk_offset = offset;
	} else {
		key.ulk_key_type = ULK_UADDR;
		key.ulk_task = proc_task(p);
		key.ulk_addr = args->addr;
	}

	if ((flags & ULF_WAIT_ADAPTIVE_SPIN) && set_owner) {
		/*
		 * Attempt the copyin outside of the lock once,
		 *
		 * If it doesn't match (which is common), return right away.
		 *
		 * If it matches, resolve the current owner, and if it is on core,
		 * spin a bit waiting for the value to change. If the owner isn't on
		 * core, or if the value stays stable, then go on with the regular
		 * blocking code.
		 */
		uint64_t end = 0;
		uint32_t u32;

		ret = copyin_atomic32(args->addr, &u32);
		if (ret || u32 != args->value) {
			goto munge_retval;
		}
		for (;;) {
			if (owner_thread == NULL && ulock_resolve_owner(u32, &owner_thread) != 0) {
				break;
			}

			/* owner_thread may have a +1 starting here */

			if (!machine_thread_on_core(owner_thread)) {
				break;
			}
			if (end == 0) {
				clock_interval_to_deadline(ulock_adaptive_spin_usecs,
				    NSEC_PER_USEC, &end);
			} else if (mach_absolute_time() > end) {
				break;
			}
			if (copyin_atomic32_wait_if_equals(args->addr, u32) != 0) {
				goto munge_retval;
			}
		}
	}

	ull_t *ull = ull_get(&key, 0, &unused_ull);
	if (ull == NULL) {
		ret = ENOMEM;
		goto munge_retval;
	}
	/* ull is locked */

	ull->ull_nwaiters++;

	/* a ulock may only be used with one opcode at a time */
	if (ull->ull_opcode == 0) {
		ull->ull_opcode = opcode;
	} else if (ull->ull_opcode != opcode) {
		ret = EDOM;
		goto out_locked;
	}

	/*
	 * We don't want this copyin to get wedged behind VM operations,
	 * but we have to read the userspace value under the ull lock for correctness.
	 *
	 * Until <rdar://problem/24999882> exists,
	 * holding the ull spinlock across copyin forces any
	 * vm_fault we encounter to fail.
	 */

	/* copyin_atomicXX always checks alignment */

	if (lock_size == 4) {
		uint32_t u32;
		copy_ret = copyin_atomic32(args->addr, &u32);
		value = u32;
	} else {
		copy_ret = copyin_atomic64(args->addr, &value);
	}

#if DEVELOPMENT || DEBUG
	/* Occasionally simulate copyin finding the user address paged out */
	if (((ull_simulate_copyin_fault == proc_getpid(p)) || (ull_simulate_copyin_fault == 1)) && (copy_ret == 0)) {
		static _Atomic int fault_inject = 0;
		if (os_atomic_inc_orig(&fault_inject, relaxed) % 73 == 0) {
			copy_ret = EFAULT;
		}
	}
#endif
	if (copy_ret != 0) {
		/* copyin() will return an error if the access to the user addr would have faulted,
		 * so just return and let the user level code fault it in.
		 */
		ret = copy_ret;
		goto out_locked;
	}

	if (value != args->value) {
		/* Lock value has changed from expected so bail out */
		goto out_locked;
	}

	if (set_owner) {
		if (owner_thread == THREAD_NULL) {
			ret = ulock_resolve_owner((uint32_t)args->value, &owner_thread);
			if (ret == EOWNERDEAD) {
				/*
				 * Translation failed - even though the lock value is up to date,
				 * whatever was stored in the lock wasn't actually a thread port.
				 */
				goto out_locked;
			}
			/* HACK: don't bail on MACH_PORT_DEAD, to avoid blowing up the no-tsd pthread lock */
			ret = 0;
		}
		/* owner_thread has a +1 reference */

		/*
		 * At this point, I know:
		 * a) owner_thread is definitely the current owner, because I just read the value
		 * b) owner_thread is either:
		 *      i) holding the user lock or
		 *      ii) has just unlocked the user lock after I looked
		 *              and is heading toward the kernel to call ull_wake.
		 *              If so, it's going to have to wait for the ull mutex.
		 *
		 * Therefore, I can ask the turnstile to promote its priority, and I can rely
		 * on it to come by later to issue the wakeup and lose its promotion.
		 */

		/* Return the +1 ref from the ull_owner field */
		old_owner = ull->ull_owner;
		ull->ull_owner = THREAD_NULL;

		if (owner_thread != THREAD_NULL) {
			/* The ull_owner field now owns a +1 ref on owner_thread */
			thread_reference(owner_thread);
			ull->ull_owner = owner_thread;
		}
	}

	wait_result_t wr;
	uint64_t timeout = args->timeout; /* nanoseconds */
	uint64_t deadline = TIMEOUT_WAIT_FOREVER;
	wait_interrupt_t interruptible = THREAD_ABORTSAFE;
	struct turnstile *ts;

	ts = turnstile_prepare((uintptr_t)ull, &ull->ull_turnstile,
	    TURNSTILE_NULL, TURNSTILE_ULOCK);
	thread_set_pending_block_hint(self, kThreadWaitUserLock);

	if (flags & ULF_WAIT_WORKQ_DATA_CONTENTION) {
		interruptible |= THREAD_WAIT_NOREPORT;
	}

	turnstile_update_inheritor(ts, owner_thread,
	    (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));

	if (timeout) {
		if (flags & ULF_DEADLINE) {
			/* caller supplied an absolute deadline */
			deadline = timeout;
		} else {
			nanoseconds_to_deadline(timeout, &deadline);
		}
	}

	wr = waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
	    interruptible, deadline);

	if (wr == THREAD_WAITING) {
		/* stash state for ulock_wait_continue to recover after blocking */
		uthread_t uthread = (uthread_t)get_bsdthread_info(self);
		uthread->uu_save.uus_ulock_wait_data.ull = ull;
		uthread->uu_save.uus_ulock_wait_data.retval = retval;
		uthread->uu_save.uus_ulock_wait_data.flags = flags;
		uthread->uu_save.uus_ulock_wait_data.owner_thread = owner_thread;
		uthread->uu_save.uus_ulock_wait_data.old_owner = old_owner;
	}

	ull_unlock(ull);

	if (unused_ull) {
		ull_free(unused_ull);
		unused_ull = NULL;
	}

	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);

	if (wr == THREAD_WAITING) {
		if (set_owner && owner_thread != THREAD_NULL) {
			/* hand the CPU directly to the lock owner */
			thread_handoff_parameter(owner_thread, ulock_wait_continue, ull, THREAD_HANDOFF_NONE);
		} else {
			assert(owner_thread == THREAD_NULL);
			thread_block_parameter(ulock_wait_continue, ull);
		}
		/* NOT REACHED */
	}

	ret = wait_result_to_return_code(wr);

	ull_lock(ull);
	turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL, TURNSTILE_ULOCK);

out_locked:
	ulock_wait_cleanup(ull, owner_thread, old_owner, retval);
	owner_thread = NULL;

	if (unused_ull) {
		ull_free(unused_ull);
		unused_ull = NULL;
	}

	assert(*retval >= 0);

munge_retval:
	if (owner_thread) {
		thread_deallocate(owner_thread);
	}
	if (ret == ESTALE) {
		ret = 0;
	}
	if ((flags & ULF_NO_ERRNO) && (ret != 0)) {
		*retval = -ret;
		ret = 0;
	}
	return ret;
}
815 
/*
 * Must be called with ull_lock held
 *
 * Common exit path for a ulock wait (both the inline return and the
 * ulock_wait_continue continuation).  Decrements the waiter count
 * (reported to userspace via *retval), clears the owner and invalidates
 * the key when this was the last waiter, drops the ull reference/lock,
 * and releases the +1 thread references passed in.
 */
static void
ulock_wait_cleanup(ull_t *ull, thread_t owner_thread, thread_t old_owner, int32_t *retval)
{
	ull_assert_owned(ull);

	thread_t old_lingering_owner = THREAD_NULL;

	*retval = --ull->ull_nwaiters;
	if (ull->ull_nwaiters == 0) {
		/*
		 * If the wait was canceled early, we might need to
		 * clear out the lingering owner reference before
		 * freeing the ull.
		 */
		old_lingering_owner = ull->ull_owner;
		ull->ull_owner = THREAD_NULL;

		/* invalidating the key (ULK_INVALID) lets ull_put() free the ull */
		memset(&ull->ull_key, 0, sizeof ull->ull_key);
		ull->ull_refcount--;
		assert(ull->ull_refcount > 0);
	}
	ull_put(ull);

	/* Need to be called after dropping the interlock */
	turnstile_cleanup();

	if (owner_thread != THREAD_NULL) {
		thread_deallocate(owner_thread);
	}

	if (old_owner != THREAD_NULL) {
		thread_deallocate(old_owner);
	}

	if (old_lingering_owner != THREAD_NULL) {
		thread_deallocate(old_lingering_owner);
	}

	assert(*retval >= 0);
}
859 
/*
 * Continuation run when a thread blocked in sys_ulock_wait2 resumes.
 * Recovers the wait state stashed in the uthread, finishes the
 * turnstile bookkeeping, and returns to userspace via
 * unix_syscall_return() (never returns to the caller).
 */
__attribute__((noreturn))
static void
ulock_wait_continue(__unused void * parameter, wait_result_t wr)
{
	uthread_t uthread = current_uthread();
	int ret = 0;

	/* state saved by sys_ulock_wait2 just before blocking */
	ull_t *ull = uthread->uu_save.uus_ulock_wait_data.ull;
	int32_t *retval = uthread->uu_save.uus_ulock_wait_data.retval;
	uint flags = uthread->uu_save.uus_ulock_wait_data.flags;
	thread_t owner_thread = uthread->uu_save.uus_ulock_wait_data.owner_thread;
	thread_t old_owner = uthread->uu_save.uus_ulock_wait_data.old_owner;

	ret = wait_result_to_return_code(wr);

	ull_lock(ull);
	turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL, TURNSTILE_ULOCK);

	ulock_wait_cleanup(ull, owner_thread, old_owner, retval);

	/* ULF_NO_ERRNO: report errors through retval instead of errno */
	if ((flags & ULF_NO_ERRNO) && (ret != 0)) {
		*retval = -ret;
		ret = 0;
	}

	unix_syscall_return(ret);
}
887 
/*
 * ulock_wake system call entry point.  DEVELOPMENT/DEBUG builds
 * intercept the UL_DEBUG_* pseudo-opcodes before calling the real
 * implementation in ulock_wake().
 */
int
sys_ulock_wake(struct proc *p, struct ulock_wake_args *args, int32_t *retval)
{
	int ret = 0;
#if DEVELOPMENT || DEBUG
	uint8_t opcode = (uint8_t)(args->operation & UL_OPCODE_MASK);

	if (opcode == UL_DEBUG_HASH_DUMP_PID) {
		*retval = ull_hash_dump(proc_task(p));
		return ret;
	} else if (opcode == UL_DEBUG_HASH_DUMP_ALL) {
		*retval = ull_hash_dump(TASK_NULL);
		return ret;
	} else if (opcode == UL_DEBUG_SIMULATE_COPYIN_FAULT) {
		ull_simulate_copyin_fault = (int)(args->wake_value);
		return ret;
	}
#endif
	ret = ulock_wake(proc_task(p), args->operation, args->addr, args->wake_value);

	/* ULF_NO_ERRNO: report errors through retval instead of errno */
	if ((args->operation & ULF_NO_ERRNO) && (ret != 0)) {
		*retval = -ret;
		ret = 0;
	}

	return ret;
}
915 
/*
 * Wake waiter(s) blocked in the kernel on the ulock at (task, addr).
 *
 * 'operation' carries a UL_* opcode plus ULF_WAKE_* flags:
 *   ULF_WAKE_ALL              wake every waiter
 *   ULF_WAKE_THREAD           wake only the thread named by wake_value
 *   ULF_WAKE_ALLOW_NON_OWNER  permit a non-owner to issue the wake
 *                             (UL_UNFAIR_LOCK only)
 *
 * Returns 0 or an errno: EINVAL for bad opcode/flag/address
 * combinations, ENOENT when no kernel state exists for the key (nothing
 * to wake), EDOM on opcode mismatch, ESRCH when the named wake thread
 * can't be resolved, EALREADY when that thread wasn't waiting.
 */
int
ulock_wake(task_t task, uint32_t operation, user_addr_t addr, uint64_t wake_value)
{
	uint8_t opcode = (uint8_t)(operation & UL_OPCODE_MASK);
	uint flags = operation & UL_FLAGS_MASK;
	int ret = 0;
	ulk_t key;

	/* involved threads - each variable holds +1 ref if not null */
	thread_t wake_thread    = THREAD_NULL;

	bool set_owner = false;
	bool allow_non_owner = false;
	bool xproc = false;

	switch (opcode) {
	case UL_UNFAIR_LOCK:
		set_owner = true;
		break;
	case UL_COMPARE_AND_WAIT:
	case UL_COMPARE_AND_WAIT64:
		break;
	case UL_COMPARE_AND_WAIT_SHARED:
	case UL_COMPARE_AND_WAIT64_SHARED:
		xproc = true;
		break;
	default:
		ret = EINVAL;
		goto munge_retval;
	}

	if ((flags & ULF_WAKE_MASK) != flags) {
		ret = EINVAL;
		goto munge_retval;
	}

	/* a directed wake is incompatible with wake-all and ownership handoff */
	if ((flags & ULF_WAKE_THREAD) && ((flags & ULF_WAKE_ALL) || set_owner)) {
		ret = EINVAL;
		goto munge_retval;
	}

	if (flags & ULF_WAKE_ALLOW_NON_OWNER) {
		if (!set_owner) {
			/* only meaningful for owner-tracking (unfair) locks */
			ret = EINVAL;
			goto munge_retval;
		}

		allow_non_owner = true;
	}

	if (addr == 0) {
		ret = EINVAL;
		goto munge_retval;
	}

	if (xproc) {
		uint64_t object = 0;
		uint64_t offset = 0;

		ret = uaddr_findobj(addr, &object, &offset);
		if (ret) {
			ret = EINVAL;
			goto munge_retval;
		}
		key.ulk_key_type = ULK_XPROC;
		key.ulk_object = object;
		key.ulk_offset = offset;
	} else {
		key.ulk_key_type = ULK_UADDR;
		key.ulk_task = task;
		key.ulk_addr = addr;
	}

	if (flags & ULF_WAKE_THREAD) {
		mach_port_name_t wake_thread_name = (mach_port_name_t)(wake_value);
		wake_thread = port_name_to_thread(wake_thread_name,
		    PORT_INTRANS_THREAD_IN_CURRENT_TASK |
		    PORT_INTRANS_THREAD_NOT_CURRENT_THREAD);
		if (wake_thread == THREAD_NULL) {
			ret = ESRCH;
			goto munge_retval;
		}
	}

	/* ULL_MUST_EXIST: no waiters in the kernel means nothing to wake */
	ull_t *ull = ull_get(&key, ULL_MUST_EXIST, NULL);
	thread_t new_owner = THREAD_NULL;
	struct turnstile *ts = TURNSTILE_NULL;
	thread_t cleanup_thread = THREAD_NULL;

	if (ull == NULL) {
		ret = ENOENT;
		goto munge_retval;
	}
	/* ull is locked */

	if (opcode != ull->ull_opcode) {
		ret = EDOM;
		goto out_ull_put;
	}

	if (set_owner) {
		if ((ull->ull_owner != current_thread()) && !allow_non_owner) {
			/*
			 * If the current thread isn't the known owner,
			 * then this wake call was late to the party,
			 * and the kernel already knows who owns the lock.
			 *
			 * This current owner already knows the lock is contended
			 * and will redrive wakes, just bail out.
			 */
			goto out_ull_put;
		}
	} else {
		assert(ull->ull_owner == THREAD_NULL);
	}

	ts = turnstile_prepare((uintptr_t)ull, &ull->ull_turnstile,
	    TURNSTILE_NULL, TURNSTILE_ULOCK);
	assert(ts != TURNSTILE_NULL);

	if (flags & ULF_WAKE_THREAD) {
		kern_return_t kr = waitq_wakeup64_thread(&ts->ts_waitq,
		    CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
		    wake_thread, THREAD_AWAKENED);
		if (kr != KERN_SUCCESS) {
			assert(kr == KERN_NOT_WAITING);
			ret = EALREADY;
		}
	} else if (flags & ULF_WAKE_ALL) {
		waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
		    THREAD_AWAKENED,
		    set_owner ? WAITQ_UPDATE_INHERITOR : WAITQ_WAKEUP_DEFAULT);
	} else if (set_owner) {
		/*
		 * The turnstile waitq is priority ordered,
		 * and will wake up the highest priority waiter
		 * and set it as the inheritor for us.
		 */
		new_owner = waitq_wakeup64_identify(&ts->ts_waitq,
		    CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
		    THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
	} else {
		waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
		    THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT);
	}

	if (set_owner) {
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
		/* the previous ull_owner reference is released after unlocking */
		cleanup_thread = ull->ull_owner;
		ull->ull_owner = new_owner;
	}

	turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL, TURNSTILE_ULOCK);

out_ull_put:
	ull_put(ull);

	if (ts != TURNSTILE_NULL) {
		/* Need to be called after dropping the interlock */
		turnstile_cleanup();
	}

	if (cleanup_thread != THREAD_NULL) {
		thread_deallocate(cleanup_thread);
	}

munge_retval:
	if (wake_thread != THREAD_NULL) {
		thread_deallocate(wake_thread);
	}

	return ret;
}
1089 
/*
 * Report the owner tid and user address for a thread blocked on a
 * ulock, keyed by the wait event (which is the ull_t pointer).
 * NOTE(review): the kdp_ prefix suggests this runs in kernel-debugger/
 * stackshot context; it only reads fields and takes no locks — confirm
 * against the kdp callers.
 */
void
kdp_ulock_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
{
	ull_t *ull = EVENT_TO_ULOCK(event);

	/* validate that the event really points into ull_zone */
	zone_require(ull_zone->kt_zv.zv_zone, ull);

	switch (ull->ull_opcode) {
	case UL_UNFAIR_LOCK:
	case UL_UNFAIR_LOCK64_SHARED:
		/* unfair locks track a kernel-known owner */
		waitinfo->owner   = thread_tid(ull->ull_owner);
		waitinfo->context = ull->ull_key.ulk_addr;
		break;
	case UL_COMPARE_AND_WAIT:
	case UL_COMPARE_AND_WAIT64:
	case UL_COMPARE_AND_WAIT_SHARED:
	case UL_COMPARE_AND_WAIT64_SHARED:
		/* compare-and-wait has no owner concept */
		waitinfo->owner   = 0;
		waitinfo->context = ull->ull_key.ulk_addr;
		break;
	default:
		panic("%s: Invalid ulock opcode %d addr %p", __FUNCTION__, ull->ull_opcode, (void*)ull);
		break;
	}
	return;
}
1116