xref: /xnu-12377.1.9/osfmk/kern/lock_mtx.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #define LOCK_PRIVATE 1
30 
31 #include <mach_ldebug.h>
32 #include <kern/locks_internal.h>
33 #include <kern/lock_stat.h>
34 #include <kern/locks.h>
35 #include <kern/kalloc.h>
36 #include <kern/thread.h>
37 
38 #include <mach/machine/sdt.h>
39 
40 #include <machine/cpu_data.h>
41 #include <machine/machine_cpu.h>
42 
43 #if !LCK_MTX_USE_ARCH
44 
45 /*
46  * lck_mtx_t
47  * ~~~~~~~~~
48  *
49  * Kernel mutexes in this implementation are made of four 32-bit words:
50  *
51  *   - word 0: turnstile compact ID (24 bits) and the 0x22 lock tag
52  *   - word 1: padding (to be used for group compact IDs)
53  *   - word 2: mutex state (lock owner + interlock, spin and waiters bits),
54  *             referred to as "data" in the code.
55  *   - word 3: adaptive spin and interlock MCS queue tails.
56  *
57  * The 64-bit word made of the last two words is referred to
58  * as the "mutex state" in the code.
59  *
60  *
61  * Core serialization rules
62  * ~~~~~~~~~~~~~~~~~~~~~~~~
63  *
64  * The mutex has a bit (lck_mtx_t::lck_mtx.ilocked or bit LCK_MTX_ILOCK
65  * of the data word) that serves as a spinlock for the mutex state.
66  *
67  *
68  * Updates to the lock fields must follow these rules:
69  *
70  *   - It is ok to "steal" the mutex (updating its data field) if no one
71  *     holds the interlock.
72  *
73  *   - Holding the interlock allows its holder to update the first 3 words
74  *     of the kernel mutex without using RMW atomics (plain stores are OK).
75  *
76  *   - Holding the interlock is required for a thread to remove itself
77  *     from the adaptive spin queue.
78  *
79  *   - Threads can enqueue themselves onto the adaptive spin wait queue
80  *     or the interlock wait queue at any time.
81  *
82  *
83  * Waiters bit and turnstiles
84  * ~~~~~~~~~~~~~~~~~~~~~~~~~~
85  *
86  * The turnstile on a kernel mutex is set by waiters, and cleared
87  * once they have all been resumed and successfully acquired the lock.
88  *
89  * LCK_MTX_NEEDS_WAKEUP being set (always with an owner set too)
90  * forces threads to the lck_mtx_unlock slowpath,
91  * in order to evaluate whether lck_mtx_unlock_wakeup() must be called.
92  *
93  * As a result, it only needs to be set at select times:
94  *
95  *   - when a thread blocks and "snitches" on the current owner thread,
96  *     so that when that thread unlocks it issues a wakeup,
97  *
98  *   - when a thread that was woken up resumes its work and becomes
99  *     the inheritor.
100  */
101 
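/*
 * Field names used throughout this file (an informal map derived from
 * the usage below; the authoritative layout lives in the locks headers):
 *
 *   - lck_mtx_tsid:     turnstile compact ID (word 0)
 *   - lck_mtx_type:     lock tag, LCK_TYPE_MUTEX while the lock is valid
 *   - lck_mtx_grp:      lock group attribute ID
 *   - lck_mtx.data:     word 2, the owner ctid plus the LCK_MTX_ILOCK,
 *                       LCK_MTX_SPIN_MODE, LCK_MTX_NEEDS_WAKEUP and
 *                       LCK_MTX_PROFILE bits
 *   - lck_mtx.owner:    the ctid portion of "data"
 *   - lck_mtx.as_tail:  word 3, adaptive spin MCS queue tail
 *   - lck_mtx.ilk_tail: word 3, interlock MCS queue tail
 *   - lck_mtx.val:      the 64-bit "mutex state" made of words 2 and 3
 */
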
102 #define ADAPTIVE_SPIN_ENABLE 0x1
103 
104 #define NOINLINE                __attribute__((noinline))
105 #define LCK_MTX_EVENT(lck)      CAST_EVENT64_T(&(lck)->lck_mtx.data)
106 #define LCK_EVENT_TO_MUTEX(e)   __container_of((uint32_t *)(e), lck_mtx_t, lck_mtx.data)
107 #define LCK_MTX_HAS_WAITERS(l)  ((l)->lck_mtx.data & LCK_MTX_NEEDS_WAKEUP)
108 
109 #if DEVELOPMENT || DEBUG
110 TUNABLE(bool, LckDisablePreemptCheck, "-disable_mtx_chk", false);
111 #endif /* DEVELOPMENT || DEBUG */
112 
113 extern unsigned int not_in_kdp;
114 
115 #if CONFIG_SPTM
116 extern const bool * sptm_xnu_triggered_panic_ptr;
117 #endif /* CONFIG_SPTM */
118 
119 KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);
120 
121 #define LCK_MTX_NULL_CTID       0x00000000u
122 
123 __enum_decl(lck_mtx_mode_t, uint32_t, {
124 	LCK_MTX_MODE_SLEEPABLE,
125 	LCK_MTX_MODE_SPIN,
126 	LCK_MTX_MODE_SPIN_ALWAYS,
127 });
128 
129 __enum_decl(lck_ilk_mode_t, uint32_t, {
130 	LCK_ILK_MODE_UNLOCK,
131 	LCK_ILK_MODE_DIRECT,
132 	LCK_ILK_MODE_FROM_AS,
133 });
134 
135 static inline void
136 lck_mtx_mcs_clear(lck_mtx_mcs_t mcs)
137 {
138 	*mcs = (struct lck_mtx_mcs){ };
139 }
140 
141 static inline lck_mcs_id_t
142 lck_mtx_get_mcs_id(void)
143 {
144 	return lck_mcs_id_current(LCK_MCS_SLOT_0);
145 }
146 
147 __pure2
148 static inline lck_mtx_mcs_t
149 lck_mtx_get_mcs(lck_mcs_id_t idx)
150 {
151 	return &lck_mcs_get_other(idx)->mcs_mtx;
152 }
153 
154 
155 #pragma mark lck_mtx_t: validation
156 
157 __abortlike
158 static void
159 __lck_mtx_invalid_panic(lck_mtx_t *lck)
160 {
161 	panic("Invalid/destroyed mutex %p: "
162 	    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
163 	    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
164 	    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
165 	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
166 }
167 
168 __abortlike
169 static void
170 __lck_mtx_not_owned_panic(lck_mtx_t *lock, thread_t thread)
171 {
172 	panic("Mutex %p is unexpectedly not owned by thread %p", lock, thread);
173 }
174 
175 __abortlike
176 static void
177 __lck_mtx_owned_panic(lck_mtx_t *lock, thread_t thread)
178 {
179 	panic("Mutex %p is unexpectedly owned by thread %p", lock, thread);
180 }
181 
182 __abortlike
183 static void
184 __lck_mtx_lock_is_sleepable_panic(lck_mtx_t *lck)
185 {
186 	// "Always" variants can never block. If the lock is held as a normal mutex
187 	// then someone is mixing always and non-always calls on the same lock, which is
188 	// forbidden.
189 	panic("Mutex %p is held as a full-mutex (spin-always lock attempted)", lck);
190 }
191 
192 #if DEVELOPMENT || DEBUG
193 __abortlike
194 static void
195 __lck_mtx_preemption_disabled_panic(lck_mtx_t *lck, int expected)
196 {
197 	panic("Attempt to take mutex %p with preemption disabled (%d)",
198 	    lck, get_preemption_level() - expected);
199 }
200 
201 __abortlike
202 static void
203 __lck_mtx_at_irq_panic(lck_mtx_t *lck)
204 {
205 	panic("Attempt to take mutex %p in IRQ context", lck);
206 }
207 
208 /*
209  *	Routine:	lck_mtx_check_preemption
210  *
211  *	Verify preemption is enabled when attempting to acquire a mutex.
212  */
213 static inline void
214 lck_mtx_check_preemption(lck_mtx_t *lock, thread_t thread, int expected)
215 {
216 #pragma unused(thread)
217 	if (lock_preemption_level_for_thread(thread) == expected) {
218 		return;
219 	}
220 	if (LckDisablePreemptCheck) {
221 		return;
222 	}
223 	if (current_cpu_datap()->cpu_hibernate) {
224 		return;
225 	}
226 	if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
227 		return;
228 	}
229 #if CONFIG_SPTM
230 	/*
231 	 * If a panic has been initiated on SPTM devices, preemption was disabled by sleh,
232 	 * but platform callbacks could be acquiring mutexes.
233 	 */
234 	if (*sptm_xnu_triggered_panic_ptr) {
235 		return;
236 	}
237 #endif
238 	__lck_mtx_preemption_disabled_panic(lock, expected);
239 }
240 
241 static inline void
242 lck_mtx_check_irq(lck_mtx_t *lock)
243 {
244 	if (ml_at_interrupt_context()) {
245 		__lck_mtx_at_irq_panic(lock);
246 	}
247 }
248 
249 #define LCK_MTX_SNIFF_PREEMPTION(thread)   lock_preemption_level_for_thread(thread)
250 #define LCK_MTX_CHECK_INVARIANTS           1
251 #else
252 #define lck_mtx_check_irq(lck)             ((void)0)
253 #define LCK_MTX_SNIFF_PREEMPTION(thread)   0
254 #define LCK_MTX_CHECK_INVARIANTS           0
255 #endif /* !DEVELOPMENT && !DEBUG */
256 
257 #if CONFIG_DTRACE
258 #define LCK_MTX_SNIFF_DTRACE()             lck_debug_state.lds_value
259 #else
260 #define LCK_MTX_SNIFF_DTRACE()             0
261 #endif
262 
263 
264 #pragma mark lck_mtx_t: alloc/init/destroy/free
265 
266 lck_mtx_t *
267 lck_mtx_alloc_init(lck_grp_t *grp, lck_attr_t *attr)
268 {
269 	lck_mtx_t      *lck;
270 
271 	lck = zalloc(KT_LCK_MTX);
272 	lck_mtx_init(lck, grp, attr);
273 	return lck;
274 }
275 
276 void
277 lck_mtx_free(lck_mtx_t *lck, lck_grp_t *grp)
278 {
279 	lck_mtx_destroy(lck, grp);
280 	zfree(KT_LCK_MTX, lck);
281 }
282 
283 __mockable void
284 lck_mtx_init(lck_mtx_t *lck, lck_grp_t *grp, lck_attr_t *attr)
285 {
286 	if (attr == LCK_ATTR_NULL) {
287 		attr = &lck_attr_default;
288 	}
289 
290 	*lck = (lck_mtx_t){
291 		.lck_mtx_type = LCK_TYPE_MUTEX,
292 		.lck_mtx_grp  = grp->lck_grp_attr_id,
293 	};
294 	if (attr->lck_attr_val & LCK_ATTR_DEBUG) {
295 		lck->lck_mtx.data |= LCK_MTX_PROFILE;
296 	}
297 
298 	lck_grp_reference(grp, &grp->lck_grp_mtxcnt);
299 }
300 
301 __mockable void
302 lck_mtx_destroy(lck_mtx_t *lck, lck_grp_t *grp)
303 {
304 	if (lck->lck_mtx_tsid && lck->lck_mtx_type == LCK_TYPE_MUTEX) {
305 		panic("Mutex to destroy still has waiters: %p: "
306 		    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
307 		    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
308 		    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
309 		    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
310 	}
311 	if (lck->lck_mtx_type != LCK_TYPE_MUTEX ||
312 	    (lck->lck_mtx.data & ~LCK_MTX_PROFILE) ||
313 	    lck->lck_mtx.as_tail || lck->lck_mtx.ilk_tail) {
314 		__lck_mtx_invalid_panic(lck);
315 	}
316 	LCK_GRP_ASSERT_ID(grp, lck->lck_mtx_grp);
317 	lck->lck_mtx_type = LCK_TYPE_NONE;
318 	lck->lck_mtx.data = LCK_MTX_TAG_DESTROYED;
319 	lck->lck_mtx_grp      = 0;
320 	lck_grp_deallocate(grp, &grp->lck_grp_mtxcnt);
321 }
322 
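/*
 * Typical usage (an illustrative sketch only; "example_grp" and
 * "example_lck" are hypothetical names, not part of this file, and the
 * group is assumed to be declared with the usual LCK_GRP_DECLARE()):
 *
 *	LCK_GRP_DECLARE(example_grp, "example");
 *
 *	lck_mtx_t *example_lck = lck_mtx_alloc_init(&example_grp, LCK_ATTR_NULL);
 *
 *	lck_mtx_lock(example_lck);
 *	// ... blockable critical section ...
 *	lck_mtx_unlock(example_lck);
 *
 *	lck_mtx_free(example_lck, &example_grp);	// destroys and frees
 */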
323 
324 #pragma mark lck_mtx_t: lck_mtx_ilk*
325 
326 static hw_spin_timeout_status_t
327 lck_mtx_ilk_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
328 {
329 	lck_mtx_t *lck = _lock;
330 
331 	panic("Mutex interlock[%p] " HW_SPIN_TIMEOUT_FMT "; "
332 	    "current owner: %p, "
333 	    "<0x%06x 0x%02x 0x%08x 0x%08x 0x%04x 0x%04x>, "
334 	    HW_SPIN_TIMEOUT_DETAILS_FMT,
335 	    lck, HW_SPIN_TIMEOUT_ARG(to, st),
336 	    ctid_get_thread_unsafe(lck->lck_mtx.owner),
337 	    lck->lck_mtx_tsid, lck->lck_mtx_type,
338 	    lck->lck_mtx_grp, lck->lck_mtx.data,
339 	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail,
340 	    HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
341 }
342 
343 static const struct hw_spin_policy lck_mtx_ilk_timeout_policy = {
344 	.hwsp_name              = "lck_mtx_t (ilk)",
345 	.hwsp_timeout_atomic    = &lock_panic_timeout,
346 	.hwsp_op_timeout        = lck_mtx_ilk_timeout_panic,
347 };
348 
349 static void
350 lck_mtx_ilk_lock_cleanup_as_mcs(
351 	lck_mtx_t               *lock,
352 	lck_mcs_id_t             idx,
353 	lck_mtx_mcs_t            mcs,
354 	hw_spin_timeout_t        to,
355 	hw_spin_state_t         *ss)
356 {
357 	lck_mtx_mcs_t nnode = NULL;
358 	lck_mcs_id_t  pidx  = (lck_mcs_id_t)mcs->lmm_as_prev;
359 	bool          was_last;
360 
361 	/*
362 	 *	This is called when the thread made use
363 	 *	of the adaptive spin queue and needs
364 	 *	to remove itself from it.
365 	 */
366 
367 	/*
368 	 *	If the thread is last, set the tail to the node before us.
369 	 */
370 	was_last = lock_cmpxchg(&lock->lck_mtx.as_tail, idx, pidx, release);
371 
372 	if (was_last) {
373 		/*
374 		 *	If @c mcs was last, we need to erase the previous
375 		 *	node link to it.
376 		 *
377 		 *	However, new nodes could have now taken our place
378 		 *	and set the previous node's @c lmm_as_next field
379 		 *	already, so we must CAS rather than blindly set.
380 		 *
381 		 *	We know the previous node is stable because
382 		 *	we hold the interlock (preventing concurrent
383 		 *	removals).
384 		 */
385 		if (pidx) {
386 			os_atomic_cmpxchg(&lck_mtx_get_mcs(pidx)->lmm_as_next,
387 			    mcs, nnode, relaxed);
388 		}
389 	} else {
390 		/*
391 		 *	If @c mcs wasn't last, then wait to make sure
392 		 *	we observe @c lmm_as_next. Once we do, we know
393 		 *	the field is stable since we hold the interlock
394 		 *	(preventing concurrent dequeues).
395 		 *
396 		 *	We can then update it to @c mcs next node index
397 		 *	(which is also stable for similar reasons).
398 		 *
399 		 *	Lastly update the previous node @c lmm_as_next
400 		 *	field as well to terminate the dequeue.
401 		 */
402 		while (!hw_spin_wait_until(&mcs->lmm_as_next, nnode, nnode)) {
403 			hw_spin_policy_t pol = &lck_mtx_ilk_timeout_policy;
404 			hw_spin_should_keep_spinning(lock, pol, to, ss);
405 		}
406 
407 		os_atomic_store(&nnode->lmm_as_prev, pidx, relaxed);
408 		if (pidx) {
409 			os_atomic_store(&lck_mtx_get_mcs(pidx)->lmm_as_next,
410 			    nnode, relaxed);
411 		}
412 	}
413 
414 	/*
415 	 *	@c mcs's fields are left dangling,
416 	 *	it is the responsibility of the caller
417 	 *	to terminate the cleanup.
418 	 */
419 }
420 
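/*
 * Routine:	lck_mtx_ilk_lock_contended
 *
 * Slow path for taking the interlock: enqueue this CPU's MCS node on the
 * interlock queue, wait to become its head, then acquire the interlock bit
 * with a CAS (the mutex "data" may still be updated by fast paths in the
 * meantime), and finally clean up the MCS state.
 */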
421 static NOINLINE void
422 lck_mtx_ilk_lock_contended(
423 	lck_mtx_t              *lock,
424 	lck_mtx_state_t         state,
425 	lck_ilk_mode_t          mode)
426 {
427 	hw_spin_policy_t  pol = &lck_mtx_ilk_timeout_policy;
428 	hw_spin_timeout_t to  = hw_spin_compute_timeout(pol);
429 	hw_spin_state_t   ss  = { };
430 
431 	lck_mtx_mcs_t     mcs, nnode, pnode;
432 	lck_mcs_id_t      idx, pidx;
433 	lck_mtx_state_t   nstate;
434 	unsigned long     ready;
435 	uint64_t          spin_start;
436 
437 	/*
438 	 *	Take a spot in the interlock MCS queue,
439 	 *	and then spin until we're at the head of it.
440 	 */
441 
442 	idx  = lck_mtx_get_mcs_id();
443 	mcs  = &lck_mcs_get_current()->mcs_mtx;
444 	if (mode != LCK_MTX_MODE_SPIN) {
445 		spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
446 	}
447 
448 	mcs->lmm_ilk_current = lock;
449 	pidx = os_atomic_xchg(&lock->lck_mtx.ilk_tail, idx, release);
450 	if (pidx) {
451 		pnode = lck_mtx_get_mcs(pidx);
452 		os_atomic_store(&pnode->lmm_ilk_next, mcs, relaxed);
453 
454 		while (!hw_spin_wait_until(&mcs->lmm_ilk_ready, ready, ready)) {
455 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
456 		}
457 	}
458 
459 
460 	/*
461 	 *	We're now the first in line, wait for the interlock
462 	 *	to look ready and take it.
463 	 *
464 	 *	We can't just assume the lock is ours for the taking,
465 	 *	because the fastpaths of lck_mtx_lock_spin{,_always}
466 	 *	only look at the mutex "data" and might steal it.
467 	 *
468 	 *	Also clear the interlock MCS tail if @c mcs is last.
469 	 */
470 	do {
471 		while (!hw_spin_wait_until(&lock->lck_mtx.val,
472 		    state.val, state.ilocked == 0)) {
473 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
474 		}
475 
476 		nstate = state;
477 		nstate.ilocked = 1;
478 		if (nstate.ilk_tail == idx) {
479 			nstate.ilk_tail = 0;
480 		}
481 	} while (!os_atomic_cmpxchg(&lock->lck_mtx, state, nstate, acquire));
482 
483 
484 	/*
485 	 *	We now have the interlock, let's cleanup the MCS state.
486 	 *
487 	 *	First, if there is a node after us, notify that it
488 	 *	is at the head of the interlock queue.
489 	 *
490 	 *	Second, perform the adaptive spin MCS cleanup if needed.
491 	 *
492 	 *	Lastly, clear the MCS node.
493 	 */
494 	if (state.ilk_tail != idx) {
495 		while (!hw_spin_wait_until(&mcs->lmm_ilk_next, nnode, nnode)) {
496 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
497 		}
498 
499 		os_atomic_store(&nnode->lmm_ilk_ready, 1, relaxed);
500 	}
501 
502 	if (mode == LCK_ILK_MODE_FROM_AS) {
503 		lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
504 	}
505 	lck_mtx_mcs_clear(mcs);
506 
507 	if (mode != LCK_MTX_MODE_SPIN) {
508 		LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
509 	}
510 }
511 
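/*
 * Take the interlock with preemption already disabled by the caller,
 * falling back to lck_mtx_ilk_lock_contended() if the interlock is held
 * or its MCS queue is not empty.
 */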
512 static void
513 lck_mtx_ilk_lock_nopreempt(lck_mtx_t *lock, lck_ilk_mode_t mode)
514 {
515 	lck_mtx_state_t state, nstate;
516 
517 	os_atomic_rmw_loop(&lock->lck_mtx.val, state.val, nstate.val, acquire, {
518 		if (__improbable(state.ilocked || state.ilk_tail)) {
519 		        os_atomic_rmw_loop_give_up({
520 				return lck_mtx_ilk_lock_contended(lock, state, mode);
521 			});
522 		}
523 
524 		nstate = state;
525 		nstate.ilocked = true;
526 	});
527 }
528 
529 static void
530 lck_mtx_ilk_unlock_v(lck_mtx_t *lock, uint32_t data)
531 {
532 	os_atomic_store(&lock->lck_mtx.data, data, release);
533 	lock_enable_preemption();
534 }
535 
536 static void
537 lck_mtx_ilk_unlock(lck_mtx_t *lock)
538 {
539 	lck_mtx_ilk_unlock_v(lock, lock->lck_mtx.data & ~LCK_MTX_ILOCK);
540 }
541 
542 
543 #pragma mark lck_mtx_t: turnstile integration
544 
545 /*
546  * Routine: lck_mtx_lock_wait
547  *
548  * Invoked in order to wait on contention.
549  *
550  * Called with the interlock locked and
551  * returns it unlocked.
552  *
553  * Always aggressively sets the owning thread to promoted,
554  * even if it is already at the same or a higher priority.
555  * This prevents it from lowering its own priority while holding a lock.
556  *
557  * TODO: Come up with a more efficient way to handle same-priority promotions
558  *      <rdar://problem/30737670> ARM mutex contention logic could avoid taking the thread lock
559  */
560 static struct turnstile *
561 lck_mtx_lock_wait(
562 	lck_mtx_t              *lck,
563 	thread_t                self,
564 	thread_t                holder,
565 	struct turnstile       *ts)
566 {
567 	uint64_t sleep_start = LCK_MTX_BLOCK_BEGIN();
568 
569 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
570 	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(self), 0, 0, 0);
571 
572 	if (ts == TURNSTILE_NULL) {
573 		ts = turnstile_prepare_compact_id((uintptr_t)lck,
574 		    lck->lck_mtx_tsid, TURNSTILE_KERNEL_MUTEX);
575 		if (lck->lck_mtx_tsid == 0) {
576 			lck->lck_mtx_tsid = ts->ts_compact_id;
577 		}
578 	}
579 	assert3u(ts->ts_compact_id, ==, lck->lck_mtx_tsid);
580 
581 	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
582 	turnstile_update_inheritor(ts, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
583 
584 	waitq_assert_wait64(&ts->ts_waitq, LCK_MTX_EVENT(lck),
585 	    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
586 
587 	lck_mtx_ilk_unlock(lck);
588 
589 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
590 
591 	thread_block(THREAD_CONTINUE_NULL);
592 
593 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
594 
595 	LCK_MTX_BLOCK_END(lck, lck->lck_mtx_grp, sleep_start);
596 
597 	return ts;
598 }
599 
600 static void
601 lck_mtx_lock_wait_done(lck_mtx_t *lck, struct turnstile *ts)
602 {
603 	if (turnstile_complete_compact_id((uintptr_t)lck, ts,
604 	    TURNSTILE_KERNEL_MUTEX)) {
605 		lck->lck_mtx_tsid = 0;
606 	}
607 }
608 
609 /*
610  * Routine:     lck_mtx_lock_will_need_wakeup
611  *
612  * Returns whether the thread is the current turnstile inheritor,
613  * which means it will have to call lck_mtx_unlock_wakeup()
614  * on unlock.
615  */
616 __attribute__((always_inline))
617 static bool
618 lck_mtx_lock_will_need_wakeup(lck_mtx_t *lck, thread_t self)
619 {
620 	uint32_t tsid = lck->lck_mtx_tsid;
621 
622 	return tsid && turnstile_get_by_id(tsid)->ts_inheritor == self;
623 }
624 
625 /*
626  * Routine:     lck_mtx_unlock_wakeup
627  *
628  * Invoked on unlock when there is contention.
629  *
630  * Called with the interlock locked.
631  *
632  * NOTE: callers should call turnstile_cleanup() after
633  * dropping the interlock.
634  */
635 static void
636 lck_mtx_unlock_wakeup(
637 	lck_mtx_t                       *lck,
638 	__kdebug_only thread_t          thread)
639 {
640 	struct turnstile *ts;
641 	kern_return_t did_wake;
642 
643 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START,
644 	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(thread), 0, 0, 0);
645 
646 	ts = turnstile_get_by_id(lck->lck_mtx_tsid);
647 
648 	/*
649 	 * We can skip turnstile_{prepare,cleanup} because
650 	 * we hold the interlock of the primitive,
651 	 * and enqueues/wakeups all happen under the interlock,
652 	 * which means the turnstile is stable.
653 	 */
654 	did_wake = waitq_wakeup64_one(&ts->ts_waitq, LCK_MTX_EVENT(lck),
655 	    THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
656 	assert(did_wake == KERN_SUCCESS);
657 
658 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
659 
660 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
661 }
662 
663 
664 #pragma mark lck_mtx_t: lck_mtx_lock
665 
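/*
 * Best-effort check of whether the thread designated by @c ctid is
 * currently running on a CPU; used to decide whether spinning for the
 * lock is worthwhile.
 */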
666 static inline bool
667 lck_mtx_ctid_on_core(uint32_t ctid)
668 {
669 	thread_t th = ctid_get_thread_unsafe(ctid);
670 
671 	return th && machine_thread_on_core_allow_invalid(th);
672 }
673 
674 #define LCK_MTX_OWNER_FOR_TRACE(lock) \
675 	VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe((lock)->lck_mtx.data))
676 
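/*
 * Routine:	lck_mtx_lock_adaptive_spin
 *
 * Spin waiting for the mutex to be released by an on-core owner, using
 * the adaptive spin MCS queue so that only the head of the queue polls
 * the lock state.  Gives up and falls back to lck_mtx_ilk_lock_contended()
 * on timeout, on an urgent AST, or when the owner goes off core.
 */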
677 static void
678 lck_mtx_lock_adaptive_spin(lck_mtx_t *lock, lck_mtx_state_t state)
679 {
680 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
681 	hw_spin_policy_t  pol = &lck_mtx_ilk_timeout_policy;
682 	hw_spin_timeout_t to  = hw_spin_compute_timeout(pol);
683 	hw_spin_state_t   ss  = { };
684 	uint64_t          deadline;
685 
686 	lck_mtx_mcs_t     mcs, node;
687 	lck_mcs_id_t      idx, pidx, clear_idx;
688 	unsigned long     prev;
689 	lck_mtx_state_t   nstate;
690 	ast_t      *const astp = ast_pending();
691 
692 	idx  = lck_mtx_get_mcs_id();
693 	mcs  = &lck_mcs_get_current()->mcs_mtx;
694 
695 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
696 	    trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
697 
698 	deadline = ml_get_timebase() + os_atomic_load(&MutexSpin, relaxed) * processor_avail_count;
699 
700 	/*
701 	 *	Take a spot in the adaptive spin queue,
702 	 *	and then spin until we're at the head of it.
703 	 *
704 	 *	Until we're at the head, we do not need to monitor
705 	 *	for whether the current owner is on core or not:
706 	 *
707 	 *	1. the head of the queue is doing it already,
708 	 *
709 	 *	2. when the entire adaptive spin queue will "give up"
710 	 *	   as a result of the owner going off core, we want
711 	 *	   to avoid a thundering herd and let the AS queue
712 	 *	   pour into the interlock queue slowly, one at a time.
713 	 *
714 	 *	Do give up if the scheduler signals that something
715 	 *	more important has shown up.
716 	 *
717 	 *	Note: this function is optimized so that we do not touch
718 	 *	      our local mcs node when we're the head of the queue.
719 	 *
720 	 *	      When the contention is between 2 cores only,
721 	 *	      this allows us to avoid touching this cacheline
722 	 *	      at all.
723 	 */
724 	pidx = os_atomic_xchg(&lock->lck_mtx.as_tail, idx, release);
725 	if (pidx) {
726 		node = lck_mtx_get_mcs(pidx);
727 		mcs->lmm_as_prev = pidx;
728 		os_atomic_store(&node->lmm_as_next, mcs, release);
729 
730 		while (!hw_spin_wait_until(&mcs->lmm_as_prev, prev,
731 		    prev == 0 || (os_atomic_load(astp, relaxed) & AST_URGENT) || (ml_get_timebase() > deadline))) {
732 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
733 		}
734 
735 		if (__improbable(prev)) {
736 			goto adaptive_spin_fail;
737 		}
738 
739 		clear_idx = 0;
740 	} else {
741 		clear_idx = idx;
742 	}
743 
744 	/*
745 	 *	We're now first in line.
746 	 *
747 	 *	It's our responsibility to monitor the lock's state
748 	 *	for whether (1) the lock has become available,
749 	 *	(2) its owner has gone off core, (3) the scheduler
750 	 *	wants its CPU back, or (4) we've spun for too long.
751 	 */
752 	deadline = ml_get_timebase() + os_atomic_load(&MutexSpin, relaxed);
753 
754 	for (;;) {
755 		state.val = lock_load_exclusive(&lock->lck_mtx.val, acquire);
756 
757 		if (__probable(!state.ilocked && !state.ilk_tail && !state.owner)) {
758 			/*
759 			 * 2-core contention: if we can, try to dequeue
760 			 * ourselves from the adaptive spin queue
761 			 * as part of this CAS in order to avoid
762 			 * the cost of lck_mtx_ilk_lock_cleanup_as_mcs()
763 			 * and zeroing the mcs node at all.
764 			 *
765 			 * Because the queue is designed to limit contention,
766 			 * using store-exclusive over an armv8.1 LSE atomic
767 			 * is actually marginally better (presumably due to
768 			 * the better codegen).
769 			 */
770 			nstate = state;
771 			nstate.ilocked = true;
772 			if (state.as_tail == clear_idx) {
773 				nstate.as_tail = 0;
774 			}
775 			if (__probable(lock_store_exclusive(&lock->lck_mtx.val,
776 			    state.val, nstate.val, acquire))) {
777 				break;
778 			}
779 		} else {
780 			lock_wait_for_event();
781 		}
782 
783 		if (__improbable(ml_get_timebase() > deadline ||
784 		    (os_atomic_load(astp, relaxed) & AST_URGENT) ||
785 		    (!state.ilocked && !state.ilk_tail && state.owner &&
786 		    !lck_mtx_ctid_on_core(state.owner)))) {
787 			goto adaptive_spin_fail;
788 		}
789 	}
790 
791 	/*
792 	 *	If we're here, we got the lock, we just have to cleanup
793 	 *	the MCS nodes and return.
794 	 */
795 	if (state.as_tail != clear_idx) {
796 		lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
797 		lck_mtx_mcs_clear(mcs);
798 	}
799 
800 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
801 	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(thread),
802 	    lock->lck_mtx_tsid, 0, 0);
803 	return;
804 
805 adaptive_spin_fail:
806 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
807 	    trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
808 	return lck_mtx_ilk_lock_contended(lock, state, LCK_ILK_MODE_FROM_AS);
809 }
810 
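/*
 * Routine:	lck_mtx_lock_contended
 *
 * Slow path for lck_mtx_lock*(): validates the lock, then either steals
 * it when completely uncontended, adaptively spins, or takes the
 * interlock and blocks on the turnstile until the owner field clears
 * (spin-always acquisitions panic rather than block), and finally
 * claims ownership.
 */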
811 static NOINLINE void
812 lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, lck_mtx_mode_t mode)
813 {
814 	struct turnstile *ts = TURNSTILE_NULL;
815 	lck_mtx_state_t   state;
816 	uint32_t          ctid = thread->ctid;
817 	uint32_t          data;
818 #if CONFIG_DTRACE
819 	int               first_miss = 0;
820 #endif /* CONFIG_DTRACE */
821 	bool              direct_wait = false;
822 	uint64_t          spin_start;
823 	uint32_t          profile;
824 
825 	lck_mtx_check_irq(lock);
826 	if (mode == LCK_MTX_MODE_SLEEPABLE) {
827 		lock_disable_preemption_for_thread(thread);
828 	}
829 
830 	for (;;) {
831 		/*
832 		 *	Load the current state and perform sanity checks
833 		 *
834 		 *	Note that the various "corrupt" values are designed
835 		 *	so that the slowpath is taken when a mutex was used
836 		 *	after destruction, so that we do not have to do
837 		 *	sanity checks in the fast path.
838 		 */
839 		state = os_atomic_load(&lock->lck_mtx, relaxed);
840 		if (state.owner == ctid) {
841 			__lck_mtx_owned_panic(lock, thread);
842 		}
843 		if (lock->lck_mtx_type != LCK_TYPE_MUTEX ||
844 		    state.data == LCK_MTX_TAG_DESTROYED) {
845 			__lck_mtx_invalid_panic(lock);
846 		}
847 		profile = (state.data & LCK_MTX_PROFILE);
848 
849 		/*
850 		 *	Attempt steal
851 		 *
852 		 *	When the lock state is 0, then no thread can be queued
853 		 *	for adaptive spinning or for the interlock yet.
854 		 *
855 	 *	As such we can attempt to take the interlock.
856 		 *	(we can't take the mutex directly because we need
857 		 *	the interlock to do turnstile operations on the way out).
858 		 */
859 		if ((state.val & ~(uint64_t)LCK_MTX_PROFILE) == 0) {
860 			if (!os_atomic_cmpxchgv(&lock->lck_mtx.val,
861 			    state.val, state.val | LCK_MTX_ILOCK,
862 			    &state.val, acquire)) {
863 				continue;
864 			}
865 			break;
866 		}
867 
868 #if CONFIG_DTRACE
869 		if (profile) {
870 			LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
871 		}
872 #endif /* CONFIG_DTRACE */
873 
874 		if (mode == LCK_MTX_MODE_SLEEPABLE) {
875 			spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
876 		} else {
877 			spin_start = LCK_MTX_SPIN_SPIN_BEGIN();
878 		}
879 
880 		/*
881 		 *	Adaptive spin or interlock
882 		 *
883 		 *	Evaluate if adaptive spinning should be attempted,
884 		 *	and if yes go to adaptive spin.
885 		 *
886 		 *	Otherwise (and this includes always-spin mutexes),
887 		 *	go for the interlock.
888 		 */
889 		if (mode != LCK_MTX_MODE_SPIN_ALWAYS &&
890 		    (state.ilocked || state.as_tail || !state.owner ||
891 		    lck_mtx_ctid_on_core(state.owner))) {
892 			lck_mtx_lock_adaptive_spin(lock, state);
893 		} else {
894 			direct_wait = true;
895 			lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_DIRECT);
896 		}
897 
898 		if (mode == LCK_MTX_MODE_SLEEPABLE) {
899 			LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
900 		} else {
901 			LCK_MTX_SPIN_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
902 		}
903 
904 		/*
905 		 *	Take or sleep
906 		 *
907 		 *	We now have the interlock. Either the owner
908 		 *	isn't set, and the mutex is ours to claim,
909 		 *	or we must go to sleep.
910 		 *
911 		 *	If we go to sleep, we need to set LCK_MTX_NEEDS_WAKEUP
912 		 *	to force the current lock owner to call
913 		 *	lck_mtx_unlock_wakeup().
914 		 */
915 		state = os_atomic_load(&lock->lck_mtx, relaxed);
916 		if (state.owner == LCK_MTX_NULL_CTID) {
917 			break;
918 		}
919 
920 		if (mode == LCK_MTX_MODE_SPIN_ALWAYS) {
921 			__lck_mtx_lock_is_sleepable_panic(lock);
922 		}
923 
924 #if CONFIG_DTRACE
925 		if (profile) {
926 			LCK_MTX_PROF_WAIT(lock, lock->lck_mtx_grp,
927 			    direct_wait, &first_miss);
928 		}
929 #endif /* CONFIG_DTRACE */
930 		os_atomic_store(&lock->lck_mtx.data,
931 		    state.data | LCK_MTX_ILOCK | LCK_MTX_NEEDS_WAKEUP,
932 		    compiler_acq_rel);
933 		ts = lck_mtx_lock_wait(lock, thread,
934 		    ctid_get_thread(state.owner), ts);
935 
936 		/* returns interlock unlocked and preemption re-enabled */
937 		lock_disable_preemption_for_thread(thread);
938 	}
939 
940 	/*
941 	 *	We can take the lock!
942 	 *
943 	 *	We only have the interlock and the owner field is 0.
944 	 *
945 	 *	Perform various turnstile cleanups if needed,
946 	 *	claim the lock, and reenable preemption (if needed).
947 	 */
948 	if (ts) {
949 		lck_mtx_lock_wait_done(lock, ts);
950 	}
951 	data = ctid | profile;
952 	if (lck_mtx_lock_will_need_wakeup(lock, thread)) {
953 		data |= LCK_MTX_NEEDS_WAKEUP;
954 	}
955 	if (mode != LCK_MTX_MODE_SLEEPABLE) {
956 		data |= LCK_MTX_ILOCK | LCK_MTX_SPIN_MODE;
957 	}
958 	os_atomic_store(&lock->lck_mtx.data, data, release);
959 
960 	if (mode == LCK_MTX_MODE_SLEEPABLE) {
961 		lock_enable_preemption();
962 	}
963 
964 	assert(thread->turnstile != NULL);
965 
966 	if (ts) {
967 		turnstile_cleanup();
968 	}
969 	LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
970 	    mode != LCK_MTX_MODE_SLEEPABLE, profile);
971 }
972 
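/*
 * Out-of-line slow path covering what the fast path CAS punted on:
 * preemption-level checks, DTrace lock profiling (LCK_MTX_PROFILE),
 * and, failing those, full contention handling in
 * lck_mtx_lock_contended().
 */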
973 #if LCK_MTX_CHECK_INVARIANTS || CONFIG_DTRACE
974 __attribute__((noinline))
975 #else
976 __attribute__((always_inline))
977 #endif
978 static void
979 lck_mtx_lock_slow(
980 	lck_mtx_t              *lock,
981 	thread_t                thread,
982 	lck_mtx_state_t         state,
983 	lck_mtx_mode_t          mode)
984 {
985 #pragma unused(state)
986 #if CONFIG_DTRACE
987 	lck_mtx_state_t ostate = {
988 		.data = LCK_MTX_PROFILE,
989 	};
990 #endif /* CONFIG_DTRACE */
991 
992 #if LCK_MTX_CHECK_INVARIANTS
993 	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
994 		lck_mtx_check_preemption(lock, thread,
995 		    (mode == LCK_MTX_MODE_SPIN));
996 	}
997 #endif /* LCK_MTX_CHECK_INVARIANTS */
998 #if CONFIG_DTRACE
999 	if (state.val == ostate.val) {
1000 		state.data = thread->ctid | LCK_MTX_PROFILE;
1001 		if (mode != LCK_MTX_MODE_SLEEPABLE) {
1002 			state.ilocked = true;
1003 			state.spin_mode = true;
1004 		}
1005 		os_atomic_cmpxchgv(&lock->lck_mtx.val,
1006 		    ostate.val, state.val, &state.val, acquire);
1007 	}
1008 	if ((state.val & ~ostate.val) == 0) {
1009 		LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
1010 		    mode != LCK_MTX_MODE_SLEEPABLE,
1011 		    state.data & LCK_MTX_PROFILE);
1012 		return;
1013 	}
1014 #endif /* CONFIG_DTRACE */
1015 	lck_mtx_lock_contended(lock, thread, mode);
1016 }
1017 
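/*
 * Common fast path for lck_mtx_lock{,_spin,_spin_always}(): a single CAS
 * on the entire 64-bit mutex state, deferring to lck_mtx_lock_slow()
 * when the lock isn't free or when preemption/DTrace checks are needed.
 */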
1018 static __attribute__((always_inline)) void
1019 lck_mtx_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
1020 {
1021 	thread_t thread = current_thread();
1022 	lck_mtx_state_t state = {
1023 		.data = thread->ctid,
1024 	};
1025 	uint64_t take_slowpath = 0;
1026 
1027 	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
1028 		take_slowpath |= LCK_MTX_SNIFF_PREEMPTION(thread);
1029 	}
1030 	take_slowpath |= LCK_MTX_SNIFF_DTRACE();
1031 
1032 	if (mode != LCK_MTX_MODE_SLEEPABLE) {
1033 		lock_disable_preemption_for_thread(thread);
1034 		state.ilocked = true;
1035 		state.spin_mode = true;
1036 	}
1037 
1038 	/*
1039 	 * Do the CAS on the entire mutex state,
1040 	 * which hence requires the ILK/AS queues
1041 	 * to be empty (which is fairer).
1042 	 */
1043 	lock_cmpxchgv(&lock->lck_mtx.val,
1044 	    0, state.val, &state.val, acquire);
1045 
1046 	take_slowpath |= state.val;
1047 	if (__improbable(take_slowpath)) {
1048 		return lck_mtx_lock_slow(lock, thread, state, mode);
1049 	}
1050 }
1051 
1052 __mockable void
1053 lck_mtx_lock(lck_mtx_t *lock)
1054 {
1055 	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
1056 }
1057 
1058 void
1059 lck_mtx_lock_spin(lck_mtx_t *lock)
1060 {
1061 	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
1062 }
1063 
1064 void
1065 lck_mtx_lock_spin_always(lck_mtx_t *lock)
1066 {
1067 	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
1068 }
1069 
1070 
1071 #pragma mark lck_mtx_t: lck_mtx_try_lock
1072 
1073 static __attribute__((always_inline)) bool
1074 lck_mtx_try_lock_slow_inline(
1075 	lck_mtx_t              *lock,
1076 	thread_t                thread,
1077 	uint32_t                odata,
1078 	uint32_t                ndata,
1079 	bool                    spin)
1080 {
1081 #pragma unused(lock, thread, odata, ndata)
1082 #if CONFIG_DTRACE
1083 	if (odata == LCK_MTX_PROFILE) {
1084 		os_atomic_cmpxchgv(&lock->lck_mtx.data,
1085 		    odata, ndata | LCK_MTX_PROFILE, &odata, acquire);
1086 	}
1087 	if ((odata & ~LCK_MTX_PROFILE) == 0) {
1088 		LCK_MTX_TRY_ACQUIRED(lock, lock->lck_mtx_grp,
1089 		    spin, odata & LCK_MTX_PROFILE);
1090 		return true;
1091 	}
1092 	if (odata & LCK_MTX_PROFILE) {
1093 		LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &(int){ 0 });
1094 	}
1095 #endif /* CONFIG_DTRACE */
1096 
1097 	if (spin) {
1098 		lock_enable_preemption();
1099 	}
1100 	return false;
1101 }
1102 
1103 #if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
1104 __attribute__((noinline))
1105 #else
1106 __attribute__((always_inline))
1107 #endif
1108 static bool
1109 lck_mtx_try_lock_slow(
1110 	lck_mtx_t              *lock,
1111 	thread_t                thread,
1112 	uint32_t                odata,
1113 	uint32_t                ndata)
1114 {
1115 	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, false);
1116 }
1117 
1118 #if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
1119 __attribute__((noinline))
1120 #else
1121 __attribute__((always_inline))
1122 #endif
1123 static bool
1124 lck_mtx_try_lock_slow_spin(
1125 	lck_mtx_t              *lock,
1126 	thread_t                thread,
1127 	uint32_t                odata,
1128 	uint32_t                ndata)
1129 {
1130 	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, true);
1131 }
1132 
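/*
 * Common fast path for lck_mtx_try_lock{,_spin,_spin_always}(): CAS on
 * the 32-bit "data" word only, returning false (without ever blocking)
 * when the lock cannot be taken immediately.
 */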
1133 static __attribute__((always_inline)) bool
1134 lck_mtx_try_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
1135 {
1136 	thread_t thread = current_thread();
1137 	uint32_t odata, ndata = thread->ctid;
1138 	uint32_t take_slowpath = 0;
1139 
1140 #if CONFIG_DTRACE
1141 	take_slowpath |= lck_debug_state.lds_value;
1142 #endif
1143 	if (mode != LCK_MTX_MODE_SLEEPABLE) {
1144 		lock_disable_preemption_for_thread(thread);
1145 		ndata |= LCK_MTX_SPIN_MODE | LCK_MTX_ILOCK;
1146 	}
1147 
1148 	/*
1149 	 * Because try_lock is likely to be used for cases like
1150 	 * lock inversion resolution, it tries a bit harder than
1151 	 * lck_mtx_lock() to take the lock, and it ignores the
1152 	 * adaptive spin / interlock queues by doing the CAS
1153 	 * on the 32-bit mutex data only.
1154 	 */
1155 	lock_cmpxchgv(&lock->lck_mtx.data, 0, ndata, &odata, acquire);
1156 
1157 	take_slowpath |= odata;
1158 	if (__probable(!take_slowpath)) {
1159 		return true;
1160 	}
1161 
1162 	if (mode == LCK_MTX_MODE_SPIN_ALWAYS &&
1163 	    (odata & LCK_MTX_CTID_MASK) &&
1164 	    !(odata & LCK_MTX_SPIN_MODE)) {
1165 		__lck_mtx_lock_is_sleepable_panic(lock);
1166 	}
1167 
1168 	if (mode == LCK_MTX_MODE_SLEEPABLE) {
1169 		return lck_mtx_try_lock_slow(lock, thread, odata, ndata);
1170 	} else {
1171 		return lck_mtx_try_lock_slow_spin(lock, thread, odata, ndata);
1172 	}
1173 }
1174 
1175 boolean_t
1176 lck_mtx_try_lock(lck_mtx_t *lock)
1177 {
1178 	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
1179 }
1180 
1181 boolean_t
1182 lck_mtx_try_lock_spin(lck_mtx_t *lock)
1183 {
1184 	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
1185 }
1186 
1187 boolean_t
1188 lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
1189 {
1190 	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
1191 }
1192 
1193 
1194 #pragma mark lck_mtx_t: lck_mtx_unlock
1195 
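/*
 * Routine:	lck_mtx_unlock_contended
 *
 * Slow path for unlock: takes the interlock if the mutex wasn't held in
 * spin mode (in spin mode the interlock is already held), wakes up a
 * waiter when LCK_MTX_NEEDS_WAKEUP is set, then releases the interlock
 * and data word.
 */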
1196 static NOINLINE void
1197 lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, uint32_t data)
1198 {
1199 	bool cleanup = false;
1200 
1201 #if !CONFIG_DTRACE
1202 	/*
1203 	 * This check is done by lck_mtx_unlock_slow() when it is enabled.
1204 	 */
1205 	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
1206 		__lck_mtx_not_owned_panic(lock, thread);
1207 	}
1208 #endif /* !CONFIG_DTRACE */
1209 
1210 	if ((data & LCK_MTX_SPIN_MODE) == 0) {
1211 		lock_disable_preemption_for_thread(thread);
1212 		lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_UNLOCK);
1213 	}
1214 
1215 	/*
1216 	 * We must re-load the data: we might have taken
1217 	 * the slowpath because another thread had taken
1218 	 * the interlock and set the NEEDS_WAKEUP bit
1219 	 * while we were spinning to get it.
1220 	 */
1221 	data = os_atomic_load(&lock->lck_mtx.data, compiler_acq_rel);
1222 	if (data & LCK_MTX_NEEDS_WAKEUP) {
1223 		lck_mtx_unlock_wakeup(lock, thread);
1224 		cleanup = true;
1225 	}
1226 	lck_mtx_ilk_unlock_v(lock, data & LCK_MTX_PROFILE);
1227 
1228 	LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, data & LCK_MTX_PROFILE);
1229 
1230 	/*
1231 	 * Do not do any turnstile operations outside of this block.
1232 	 *
1233 	 * lock/unlock is called at an early stage of boot while single
1234 	 * threaded, without turnstiles being available yet.
1235 	 * Even without contention we can come through the slow path
1236 	 * if the mutex is acquired as a spin lock.
1237 	 */
1238 	if (cleanup) {
1239 		turnstile_cleanup();
1240 	}
1241 }
1242 
1243 #if CONFIG_DTRACE
1244 __attribute__((noinline))
1245 #else
1246 __attribute__((always_inline))
1247 #endif
1248 static void
1249 lck_mtx_unlock_slow(lck_mtx_t *lock, thread_t thread, uint32_t data)
1250 {
1251 #if CONFIG_DTRACE
1252 	/*
1253 	 *	If DTrace is enabled, locks can be profiled,
1254 	 *	which causes the fastpath of unlock to fail.
1255 	 */
1256 	if ((data & LCK_MTX_BITS_MASK) == LCK_MTX_PROFILE) {
1257 		os_atomic_cmpxchgv(&lock->lck_mtx.data, data, LCK_MTX_PROFILE,
1258 		    &data, release);
1259 	}
1260 	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
1261 		__lck_mtx_not_owned_panic(lock, thread);
1262 	}
1263 	if ((data & (LCK_MTX_BITS_MASK & ~LCK_MTX_PROFILE)) == 0) {
1264 		LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, false);
1265 		return;
1266 	}
1267 #endif /* CONFIG_DTRACE */
1268 
1269 	lck_mtx_unlock_contended(lock, thread, data);
1270 }
1271 
1272 __mockable void
1273 lck_mtx_unlock(lck_mtx_t *lock)
1274 {
1275 	thread_t thread = current_thread();
1276 	uint32_t take_slowpath = 0;
1277 	uint32_t data;
1278 
1279 	take_slowpath |= LCK_MTX_SNIFF_DTRACE();
1280 
1281 	/*
1282 	 * The fast path ignores the ILK/AS queues on purpose;
1283 	 * those really are a "lock" concept, not an unlock one.
1284 	 */
1285 	if (__probable(lock_cmpxchgv(&lock->lck_mtx.data,
1286 	    thread->ctid, 0, &data, release))) {
1287 		if (__probable(!take_slowpath)) {
1288 			return;
1289 		}
1290 	}
1291 
1292 	lck_mtx_unlock_slow(lock, thread, data);
1293 }
1294 
1295 
1296 #pragma mark lck_mtx_t: misc
1297 
1298 void
1299 lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
1300 {
1301 	lck_mtx_state_t state  = os_atomic_load(&lock->lck_mtx, relaxed);
1302 	thread_t        thread = current_thread();
1303 
1304 	if (type == LCK_MTX_ASSERT_OWNED) {
1305 		if (state.owner != thread->ctid) {
1306 			__lck_mtx_not_owned_panic(lock, thread);
1307 		}
1308 	} else if (type == LCK_MTX_ASSERT_NOTOWNED) {
1309 		if (state.owner == thread->ctid) {
1310 			__lck_mtx_owned_panic(lock, thread);
1311 		}
1312 	} else {
1313 		panic("lck_mtx_assert(): invalid arg (%u)", type);
1314 	}
1315 }
1316 
1317 /*
1318  *	Routine:	lck_mtx_convert_spin
1319  *
1320  *	Convert a mutex held for spin into a held full mutex
1321  */
1322 void
1323 lck_mtx_convert_spin(lck_mtx_t *lock)
1324 {
1325 	lck_mtx_state_t state  = os_atomic_load(&lock->lck_mtx, relaxed);
1326 	thread_t        thread = current_thread();
1327 	uint32_t        data   = thread->ctid;
1328 
1329 	if (state.owner != data) {
1330 		__lck_mtx_not_owned_panic(lock, thread);
1331 	}
1332 
1333 	if (state.spin_mode) {
1334 		/*
1335 		 * Note: we can acquire the lock in spin mode
1336 		 *       _and_ be the inheritor if we waited.
1337 		 *
1338 		 *       We must only clear ilocked and spin_mode,
1339 		 *       but preserve owner and needs_wakeup.
1340 		 */
1341 		state.ilocked = false;
1342 		state.spin_mode = false;
1343 		lck_mtx_ilk_unlock_v(lock, state.data);
1344 		turnstile_cleanup();
1345 	}
1346 }
1347 
1348 /*
1349  * Routine: kdp_lck_mtx_lock_spin_is_acquired
1350  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1351  */
1352 boolean_t
1353 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
1354 {
1355 	lck_mtx_state_t state = os_atomic_load(&lck->lck_mtx, relaxed);
1356 
1357 	if (not_in_kdp) {
1358 		panic("panic: spinlock acquired check done outside of kernel debugger");
1359 	}
1360 	if (state.data == LCK_MTX_TAG_DESTROYED) {
1361 		return false;
1362 	}
1363 	return state.owner || state.ilocked;
1364 }
1365 
1366 void
1367 kdp_lck_mtx_find_owner(
1368 	struct waitq           *waitq __unused,
1369 	event64_t               event,
1370 	thread_waitinfo_t      *waitinfo)
1371 {
1372 	lck_mtx_t      *mutex  = LCK_EVENT_TO_MUTEX(event);
1373 	lck_mtx_state_t state  = os_atomic_load(&mutex->lck_mtx, relaxed);
1374 
1375 	assert3u(state.data, !=, LCK_MTX_TAG_DESTROYED);
1376 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1377 	waitinfo->owner   = thread_tid(ctid_get_thread(state.owner));
1378 }
1379 
1380 #endif /* !LCK_MTX_USE_ARCH */
1381 
1382 /*
1383  * Routine:     mutex_pause
1384  *
1385  * Called by former callers of simple_lock_pause().
1386  */
1387 #define MAX_COLLISION_COUNTS    32
1388 #define MAX_COLLISION   8
1389 
1390 unsigned int max_collision_count[MAX_COLLISION_COUNTS];
1391 
1392 uint32_t collision_backoffs[MAX_COLLISION] = {
1393 	10, 50, 100, 200, 400, 600, 800, 1000
1394 };
1395 
1396 
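/*
 * For example, mutex_pause(3) sleeps for collision_backoffs[3], i.e.
 * 200us; collision counts at or above MAX_COLLISION all use the
 * largest backoff (1000us).
 */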
1397 void
1398 mutex_pause(uint32_t collisions)
1399 {
1400 	wait_result_t wait_result;
1401 	uint32_t        back_off;
1402 
1403 	if (collisions >= MAX_COLLISION_COUNTS) {
1404 		collisions = MAX_COLLISION_COUNTS - 1;
1405 	}
1406 	max_collision_count[collisions]++;
1407 
1408 	if (collisions >= MAX_COLLISION) {
1409 		collisions = MAX_COLLISION - 1;
1410 	}
1411 	back_off = collision_backoffs[collisions];
1412 
1413 	wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, back_off, NSEC_PER_USEC);
1414 	assert(wait_result == THREAD_WAITING);
1415 
1416 	wait_result = thread_block(THREAD_CONTINUE_NULL);
1417 	assert(wait_result == THREAD_TIMED_OUT);
1418 }
1419 
1420 
1421 unsigned int mutex_yield_wait = 0;
1422 unsigned int mutex_yield_no_wait = 0;
1423 
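/*
 * Routine:	lck_mtx_yield
 *
 * If the mutex has waiters, briefly drop it (unlock, mutex_pause(0),
 * re-lock) so that one of them can make progress.  Returns whether
 * the mutex had waiters.
 */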
1424 boolean_t
1425 lck_mtx_yield(
1426 	lck_mtx_t   *lck)
1427 {
1428 	bool has_waiters = LCK_MTX_HAS_WAITERS(lck);
1429 
1430 #if DEBUG
1431 	lck_mtx_assert(lck, LCK_MTX_ASSERT_OWNED);
1432 #endif /* DEBUG */
1433 
1434 	if (!has_waiters) {
1435 		mutex_yield_no_wait++;
1436 	} else {
1437 		mutex_yield_wait++;
1438 		lck_mtx_unlock(lck);
1439 		mutex_pause(0);
1440 		lck_mtx_lock(lck);
1441 	}
1442 	return has_waiters;
1443 }
1444