/* xref: xnu-12377.81.4/osfmk/kern/lock_mtx.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796) */
/*
 * Copyright (c) 2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#define LOCK_PRIVATE 1

#include <mach_ldebug.h>
#include <kern/locks_internal.h>
#include <kern/lock_stat.h>
#include <kern/locks.h>
#include <kern/kalloc.h>
#include <kern/thread.h>

#include <mach/machine/sdt.h>

#include <machine/cpu_data.h>
#include <machine/machine_cpu.h>

#if !LCK_MTX_USE_ARCH

/*
 * lck_mtx_t
 * ~~~~~~~~~
 *
 * Kernel mutexes in this implementation are made of four 32-bit words:
 *
 *   - word 0: turnstile compact ID (24 bits) and the 0x22 lock tag
 *   - word 1: padding (to be used for group compact IDs)
 *   - word 2: mutex state (lock owner + interlock, spin and waiters bits),
 *             referred to as "data" in the code.
 *   - word 3: adaptive spin and interlock MCS queue tails.
 *
 * The 64-bit word made of the last two words is referred to
 * as the "mutex state" in code.
 *
 *
 * Core serialization rules
 * ~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * The mutex has a bit (lck_mtx_t::lck_mtx.ilocked or bit LCK_MTX_ILOCK
 * of the data word) that serves as a spinlock for the mutex state.
 *
 *
 * Updating the lock fields must follow these rules:
 *
 *   - It is ok to "steal" the mutex (updating its data field) if no one
 *     holds the interlock.
 *
 *   - Holding the interlock allows its holder to update the first 3 words
 *     of the kernel mutex without using RMW atomics (plain stores are OK).
 *
 *   - Holding the interlock is required for a thread to remove itself
 *     from the adaptive spin queue.
 *
 *   - Threads can enqueue themselves onto the adaptive spin wait queue
 *     or the interlock wait queue at any time.
 *
 *
 * Waiters bit and turnstiles
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * The turnstile on a kernel mutex is set by waiters, and cleared
 * once they have all been resumed and successfully acquired the lock.
 *
 * LCK_MTX_NEEDS_WAKEUP being set (always with an owner set too)
 * forces threads to the lck_mtx_unlock slowpath,
 * in order to evaluate whether lck_mtx_unlock_wakeup() must be called.
 *
 * As a result, it only needs to be set at select times:
 *
 *   - when a thread blocks and "snitches" on the current thread owner,
 *     so that when that thread unlocks it calls wakeup,
 *
 *   - when a thread that was woken up resumes its work and becomes
 *     the inheritor.
 */
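
/*
 * Illustrative sketch (not part of the implementation): the "steal"
 * rule above, expressed as a CAS. Assuming an entirely clear 64-bit
 * state (no owner, interlock free, both MCS tails empty), a thread
 * may claim the mutex in one atomic step without ever touching the
 * interlock:
 *
 *	lck_mtx_state_t ostate = { 0 };
 *	lck_mtx_state_t nstate = { .data = current_thread()->ctid };
 *
 *	if (os_atomic_cmpxchg(&lock->lck_mtx.val,
 *	    ostate.val, nstate.val, acquire)) {
 *		// lock "stolen": we are now the owner
 *	}
 *
 * This is exactly the shape of the fast path in
 * lck_mtx_lock_fastpath() below.
 */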

#define ADAPTIVE_SPIN_ENABLE 0x1

#define NOINLINE                __attribute__((noinline))
#define LCK_MTX_EVENT(lck)      CAST_EVENT64_T(&(lck)->lck_mtx.data)
#define LCK_EVENT_TO_MUTEX(e)   __container_of((uint32_t *)(e), lck_mtx_t, lck_mtx.data)
#define LCK_MTX_HAS_WAITERS(l)  ((l)->lck_mtx.data & LCK_MTX_NEEDS_WAKEUP)

#if DEVELOPMENT || DEBUG
TUNABLE(bool, LckDisablePreemptCheck, "-disable_mtx_chk", false);
#endif /* DEVELOPMENT || DEBUG */

extern unsigned int not_in_kdp;

#if CONFIG_SPTM
extern const bool * sptm_xnu_triggered_panic_ptr;
#endif /* CONFIG_SPTM */

KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);

#define LCK_MTX_NULL_CTID       0x00000000u

__enum_decl(lck_mtx_mode_t, uint32_t, {
	LCK_MTX_MODE_SLEEPABLE,
	LCK_MTX_MODE_SPIN,
	LCK_MTX_MODE_SPIN_ALWAYS,
});

__enum_decl(lck_ilk_mode_t, uint32_t, {
	LCK_ILK_MODE_UNLOCK,
	LCK_ILK_MODE_DIRECT,
	LCK_ILK_MODE_FROM_AS,
});

static inline void
lck_mtx_mcs_clear(lck_mtx_mcs_t mcs)
{
	*mcs = (struct lck_mtx_mcs){ };
}

static inline lck_mcs_id_t
lck_mtx_get_mcs_id(void)
{
	return lck_mcs_id_current(LCK_MCS_SLOT_0);
}

__pure2
static inline lck_mtx_mcs_t
lck_mtx_get_mcs(lck_mcs_id_t idx)
{
	return &lck_mcs_get_other(idx)->mcs_mtx;
}


#pragma mark lck_mtx_t: validation

__abortlike
static void
__lck_mtx_invalid_panic(lck_mtx_t *lck)
{
	panic("Invalid/destroyed mutex %p: "
	    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
	    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
	    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
}

__abortlike
static void
__lck_mtx_not_owned_panic(lck_mtx_t *lock, thread_t thread)
{
	panic("Mutex %p is unexpectedly not owned by thread %p", lock, thread);
}

#if !LCK_MTX_USE_ARCH
__abortlike
static void
__lck_mtx_not_locked_spin(lck_mtx_t *lock, thread_t thread)
{
	panic("Mutex %p is unexpectedly not locked in spin mode by thread %p",
	    lock, thread);
}
#endif /* !LCK_MTX_USE_ARCH */

__abortlike
static void
__lck_mtx_owned_panic(lck_mtx_t *lock, thread_t thread)
{
	panic("Mutex %p is unexpectedly owned by thread %p", lock, thread);
}

__abortlike
static void
__lck_mtx_lock_is_sleepable_panic(lck_mtx_t *lck)
{
	// "Always" variants can never block. If the lock is held as a normal mutex
	// then someone is mixing always and non-always calls on the same lock, which is
	// forbidden.
	panic("Mutex %p is held as a full-mutex (spin-always lock attempted)", lck);
}

#if DEVELOPMENT || DEBUG
__abortlike
static void
__lck_mtx_preemption_disabled_panic(lck_mtx_t *lck, int expected)
{
	panic("Attempt to take mutex %p with preemption disabled (%d)",
	    lck, get_preemption_level() - expected);
}

__abortlike
static void
__lck_mtx_at_irq_panic(lck_mtx_t *lck)
{
	panic("Attempt to take mutex %p in IRQ context", lck);
}

/*
 *	Routine:	lck_mtx_check_preemption
 *
 *	Verify preemption is enabled when attempting to acquire a mutex.
 */
static inline void
lck_mtx_check_preemption(lck_mtx_t *lock, thread_t thread, int expected)
{
#pragma unused(thread)
	if (lock_preemption_level_for_thread(thread) == expected) {
		return;
	}
	if (LckDisablePreemptCheck) {
		return;
	}
	if (current_cpu_datap()->cpu_hibernate) {
		return;
	}
	if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
		return;
	}
#if CONFIG_SPTM
	/*
	 * If a panic has been initiated on SPTM devices, preemption was disabled by sleh,
	 * but platform callbacks could be acquiring mutexes.
	 */
	if (*sptm_xnu_triggered_panic_ptr) {
		return;
	}
#endif
	__lck_mtx_preemption_disabled_panic(lock, expected);
}
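
/*
 * Usage note (illustrative): the "expected" argument accounts for
 * preemption that the lock code itself disabled. The slow path below
 * passes 0 for sleepable mutexes (preemption must be fully enabled)
 * and 1 for spin mode, because the spin fast path already disabled
 * preemption once before getting here:
 *
 *	lck_mtx_check_preemption(lock, thread,
 *	    (mode == LCK_MTX_MODE_SPIN));
 *
 * Anything beyond "expected" means the caller tried to take a
 * sleepable mutex with preemption disabled, which panics on
 * DEVELOPMENT/DEBUG kernels.
 */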

static inline void
lck_mtx_check_irq(lck_mtx_t *lock)
{
	if (ml_at_interrupt_context()) {
		__lck_mtx_at_irq_panic(lock);
	}
}

#define LCK_MTX_SNIFF_PREEMPTION(thread)   lock_preemption_level_for_thread(thread)
#define LCK_MTX_CHECK_INVARIANTS           1
#else
#define lck_mtx_check_irq(lck)             ((void)0)
#define LCK_MTX_SNIFF_PREEMPTION(thread)   0
#define LCK_MTX_CHECK_INVARIANTS           0
#endif /* !DEVELOPMENT && !DEBUG */

#if CONFIG_DTRACE
#define LCK_MTX_SNIFF_DTRACE()             lck_debug_state.lds_value
#else
#define LCK_MTX_SNIFF_DTRACE()             0
#endif


#pragma mark lck_mtx_t: alloc/init/destroy/free

lck_mtx_t *
lck_mtx_alloc_init(lck_grp_t *grp, lck_attr_t *attr)
{
	lck_mtx_t      *lck;

	lck = zalloc(KT_LCK_MTX);
	lck_mtx_init(lck, grp, attr);
	return lck;
}

void
lck_mtx_free(lck_mtx_t *lck, lck_grp_t *grp)
{
	lck_mtx_destroy(lck, grp);
	zfree(KT_LCK_MTX, lck);
}

__mockable void
lck_mtx_init(lck_mtx_t *lck, lck_grp_t *grp, lck_attr_t *attr)
{
	if (attr == LCK_ATTR_NULL) {
		attr = &lck_attr_default;
	}

	*lck = (lck_mtx_t){
		.lck_mtx_type = LCK_TYPE_MUTEX,
		.lck_mtx_grp  = grp->lck_grp_attr_id,
	};
	if (attr->lck_attr_val & LCK_ATTR_DEBUG) {
		lck->lck_mtx.data |= LCK_MTX_PROFILE;
	}

	lck_grp_reference(grp, &grp->lck_grp_mtxcnt);
}
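
/*
 * Illustrative lifecycle sketch (not from this file): how a kernel
 * subsystem typically allocates and uses one of these mutexes. The
 * group name "com.example.subsys" is a made-up placeholder.
 *
 *	static lck_grp_t *my_grp;
 *	static lck_mtx_t *my_mtx;
 *
 *	my_grp = lck_grp_alloc_init("com.example.subsys", LCK_GRP_ATTR_NULL);
 *	my_mtx = lck_mtx_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *	lck_mtx_lock(my_mtx);
 *	// ... critical section, may block ...
 *	lck_mtx_unlock(my_mtx);
 *
 *	lck_mtx_free(my_mtx, my_grp);   // calls lck_mtx_destroy() + zfree()
 */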

__mockable void
lck_mtx_destroy(lck_mtx_t *lck, lck_grp_t *grp)
{
	if (lck->lck_mtx_tsid && lck->lck_mtx_type == LCK_TYPE_MUTEX) {
		panic("Mutex to destroy still has waiters: %p: "
		    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
		    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
		    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
		    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
	}
	if (lck->lck_mtx_type != LCK_TYPE_MUTEX ||
	    (lck->lck_mtx.data & ~LCK_MTX_PROFILE) ||
	    lck->lck_mtx.as_tail || lck->lck_mtx.ilk_tail) {
		__lck_mtx_invalid_panic(lck);
	}
	LCK_GRP_ASSERT_ID(grp, lck->lck_mtx_grp);
	lck->lck_mtx_type = LCK_TYPE_NONE;
	lck->lck_mtx.data = LCK_MTX_TAG_DESTROYED;
	lck->lck_mtx_grp  = 0;
	lck_grp_deallocate(grp, &grp->lck_grp_mtxcnt);
}


#pragma mark lck_mtx_t: lck_mtx_ilk*

static hw_spin_timeout_status_t
lck_mtx_ilk_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
{
	lck_mtx_t *lck = _lock;

	panic("Mutex interlock[%p] " HW_SPIN_TIMEOUT_FMT "; "
	    "current owner: %p, "
	    "<0x%06x 0x%02x 0x%08x 0x%08x 0x%04x 0x%04x>, "
	    HW_SPIN_TIMEOUT_DETAILS_FMT,
	    lck, HW_SPIN_TIMEOUT_ARG(to, st),
	    ctid_get_thread_unsafe(lck->lck_mtx.owner),
	    lck->lck_mtx_tsid, lck->lck_mtx_type,
	    lck->lck_mtx_grp, lck->lck_mtx.data,
	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail,
	    HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
}

static const struct hw_spin_policy lck_mtx_ilk_timeout_policy = {
	.hwsp_name              = "lck_mtx_t (ilk)",
	.hwsp_timeout_atomic    = &lock_panic_timeout,
	.hwsp_op_timeout        = lck_mtx_ilk_timeout_panic,
};

static void
lck_mtx_ilk_lock_cleanup_as_mcs(
	lck_mtx_t               *lock,
	lck_mcs_id_t             idx,
	lck_mtx_mcs_t            mcs,
	hw_spin_timeout_t        to,
	hw_spin_state_t         *ss)
{
	lck_mtx_mcs_t nnode = NULL;
	lck_mcs_id_t  pidx  = (lck_mcs_id_t)mcs->lmm_as_prev;
	bool          was_last;

	/*
	 *	This is called when the thread made use
	 *	of the adaptive spin queue and needs
	 *	to remove itself from it.
	 */

	/*
	 *	If the thread is last, set the tail to the node before us.
	 */
	was_last = lock_cmpxchg(&lock->lck_mtx.as_tail, idx, pidx, release);

	if (was_last) {
		/*
		 *	If @c mcs was last, we need to erase the previous
		 *	node link to it.
		 *
		 *	However, new nodes could have now taken our place
		 *	and set the previous node's @c lmm_as_next field
		 *	already, so we must CAS rather than blindly set.
		 *
		 *	We know the previous node is stable because
		 *	we hold the interlock (preventing concurrent
		 *	removals).
		 */
		if (pidx) {
			os_atomic_cmpxchg(&lck_mtx_get_mcs(pidx)->lmm_as_next,
			    mcs, nnode, relaxed);
		}
	} else {
		/*
		 *	If @c mcs wasn't last, then wait to make sure
		 *	we observe @c lmm_as_next. Once we do, we know
		 *	the field is stable since we hold the interlock
		 *	(preventing concurrent dequeues).
		 *
		 *	We can then update it to @c mcs next node index
		 *	(which is also stable for similar reasons).
		 *
		 *	Lastly update the previous node @c lmm_as_next
		 *	field as well to terminate the dequeue.
		 */
		while (!hw_spin_wait_until(&mcs->lmm_as_next, nnode, nnode)) {
			hw_spin_policy_t pol = &lck_mtx_ilk_timeout_policy;
			hw_spin_should_keep_spinning(lock, pol, to, ss);
		}

		os_atomic_store(&nnode->lmm_as_prev, pidx, relaxed);
		if (pidx) {
			os_atomic_store(&lck_mtx_get_mcs(pidx)->lmm_as_next,
			    nnode, relaxed);
		}
	}

	/*
	 *	@c mcs's fields are left dangling,
	 *	it is the responsibility of the caller
	 *	to terminate the cleanup.
	 */
}
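
/*
 * Illustrative diagram (assumption: queued nodes P <-> M <-> N, with
 * M being the node dequeued above):
 *
 *   M is last (tail == M):            M is in the middle:
 *
 *	P <-> M    tail: M -> P         P <-> M <-> N
 *	P ....     (CAS P->next         P <------> N
 *	           from M to NULL)      (N->prev = P, P->next = N)
 *
 * In both cases the interlock keeps P (and N, once observed) from
 * being dequeued concurrently, which is what makes the plain stores
 * on the non-tail path safe.
 */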

static NOINLINE void
lck_mtx_ilk_lock_contended(
	lck_mtx_t              *lock,
	lck_mtx_state_t         state,
	lck_ilk_mode_t          mode)
{
	hw_spin_policy_t  pol = &lck_mtx_ilk_timeout_policy;
	hw_spin_timeout_t to  = hw_spin_compute_timeout(pol);
	hw_spin_state_t   ss  = { };

	lck_mtx_mcs_t     mcs, nnode, pnode;
	lck_mcs_id_t      idx, pidx;
	lck_mtx_state_t   nstate;
	unsigned long     ready;
	uint64_t          spin_start;

	/*
	 *	Take a spot in the interlock MCS queue,
	 *	and then spin until we're at the head of it.
	 */

	idx  = lck_mtx_get_mcs_id();
	mcs  = &lck_mcs_get_current()->mcs_mtx;
	if (mode != LCK_ILK_MODE_UNLOCK) {
		spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
	}

	mcs->lmm_ilk_current = lock;
	pidx = os_atomic_xchg(&lock->lck_mtx.ilk_tail, idx, release);
	if (pidx) {
		pnode = lck_mtx_get_mcs(pidx);
		os_atomic_store(&pnode->lmm_ilk_next, mcs, relaxed);

		while (!hw_spin_wait_until(&mcs->lmm_ilk_ready, ready, ready)) {
			hw_spin_should_keep_spinning(lock, pol, to, &ss);
		}
	}


	/*
	 *	We're now the first in line, wait for the interlock
	 *	to look ready and take it.
	 *
	 *	We can't just assume the lock is ours for the taking,
	 *	because the fastpath of lck_mtx_lock_spin{,_always}
	 *	only looks at the mutex "data" and might steal it.
	 *
	 *	Also clear the interlock MCS tail if @c mcs is last.
	 */
	do {
		while (!hw_spin_wait_until(&lock->lck_mtx.val,
		    state.val, state.ilocked == 0)) {
			hw_spin_should_keep_spinning(lock, pol, to, &ss);
		}

		nstate = state;
		nstate.ilocked = 1;
		if (nstate.ilk_tail == idx) {
			nstate.ilk_tail = 0;
		}
	} while (!os_atomic_cmpxchg(&lock->lck_mtx, state, nstate, acquire));


	/*
	 *	We now have the interlock, let's cleanup the MCS state.
	 *
	 *	First, if there is a node after us, notify that it
	 *	is at the head of the interlock queue.
	 *
	 *	Second, perform the adaptive spin MCS cleanup if needed.
	 *
	 *	Lastly, clear the MCS node.
	 */
	if (state.ilk_tail != idx) {
		while (!hw_spin_wait_until(&mcs->lmm_ilk_next, nnode, nnode)) {
			hw_spin_should_keep_spinning(lock, pol, to, &ss);
		}

		os_atomic_store(&nnode->lmm_ilk_ready, 1, relaxed);
	}

	if (mode == LCK_ILK_MODE_FROM_AS) {
		lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
	}
	lck_mtx_mcs_clear(mcs);

	if (mode != LCK_ILK_MODE_UNLOCK) {
		LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
	}
}

static void
lck_mtx_ilk_lock_nopreempt(lck_mtx_t *lock, lck_ilk_mode_t mode)
{
	lck_mtx_state_t state, nstate;

	os_atomic_rmw_loop(&lock->lck_mtx.val, state.val, nstate.val, acquire, {
		if (__improbable(state.ilocked || state.ilk_tail)) {
			os_atomic_rmw_loop_give_up({
				return lck_mtx_ilk_lock_contended(lock, state, mode);
			});
		}

		nstate = state;
		nstate.ilocked = true;
	});
}

static void
lck_mtx_ilk_unlock_v(lck_mtx_t *lock, uint32_t data)
{
	os_atomic_store(&lock->lck_mtx.data, data, release);
	lock_enable_preemption();
}

static void
lck_mtx_ilk_unlock(lck_mtx_t *lock)
{
	lck_mtx_ilk_unlock_v(lock, lock->lck_mtx.data & ~LCK_MTX_ILOCK);
}


#pragma mark lck_mtx_t: turnstile integration

/*
 * Routine: lck_mtx_lock_wait
 *
 * Invoked in order to wait on contention.
 *
 * Called with the interlock locked and
 * returns it unlocked.
 *
 * Always aggressively sets the owning thread to promoted,
 * even if it's the same or higher priority.
 * This prevents it from lowering its own priority while holding a lock.
 *
 * TODO: Come up with a more efficient way to handle same-priority promotions
 *      <rdar://problem/30737670> ARM mutex contention logic could avoid taking the thread lock
 */
static struct turnstile *
lck_mtx_lock_wait(
	lck_mtx_t              *lck,
	thread_t                self,
	thread_t                holder,
	struct turnstile       *ts)
{
	uint64_t sleep_start = LCK_MTX_BLOCK_BEGIN();

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(self), 0, 0, 0);

	if (ts == TURNSTILE_NULL) {
		ts = turnstile_prepare_compact_id((uintptr_t)lck,
		    lck->lck_mtx_tsid, TURNSTILE_KERNEL_MUTEX);
		if (lck->lck_mtx_tsid == 0) {
			lck->lck_mtx_tsid = ts->ts_compact_id;
		}
	}
	assert3u(ts->ts_compact_id, ==, lck->lck_mtx_tsid);

	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
	turnstile_update_inheritor(ts, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));

	waitq_assert_wait64(&ts->ts_waitq, LCK_MTX_EVENT(lck),
	    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);

	lck_mtx_ilk_unlock(lck);

	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);

	thread_block(THREAD_CONTINUE_NULL);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);

	LCK_MTX_BLOCK_END(lck, lck->lck_mtx_grp, sleep_start);

	return ts;
}
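
/*
 * Illustrative pairing (a sketch of what the surrounding code does,
 * not additional API): a waiter and the owner coordinate through the
 * NEEDS_WAKEUP bit, which is only ever set under the interlock.
 *
 *	waiter (lck_mtx_lock_contended):      owner (lck_mtx_unlock):
 *	  ilk_lock
 *	  data |= LCK_MTX_NEEDS_WAKEUP         fast-path CAS fails
 *	  ts = lck_mtx_lock_wait(...)          (data != just our ctid),
 *	    -> blocks, drops interlock         so take the slow path:
 *	                                         ilk_lock
 *	                                         lck_mtx_unlock_wakeup()
 *	                                         ilk_unlock
 *	                                         turnstile_cleanup()
 *
 * Because the bit is set before the waiter blocks, the owner cannot
 * release the mutex without observing it.
 */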

static void
lck_mtx_lock_wait_done(lck_mtx_t *lck, struct turnstile *ts)
{
	if (turnstile_complete_compact_id((uintptr_t)lck, ts,
	    TURNSTILE_KERNEL_MUTEX)) {
		lck->lck_mtx_tsid = 0;
	}
}

/*
 * Routine:     lck_mtx_lock_will_need_wakeup
 *
 * Returns whether the thread is the current turnstile inheritor,
 * which means it will have to call lck_mtx_unlock_wakeup()
 * on unlock.
 */
__attribute__((always_inline))
static bool
lck_mtx_lock_will_need_wakeup(lck_mtx_t *lck, thread_t self)
{
	uint32_t tsid = lck->lck_mtx_tsid;

	return tsid && turnstile_get_by_id(tsid)->ts_inheritor == self;
}

/*
 * Routine:     lck_mtx_unlock_wakeup
 *
 * Invoked on unlock when there is contention.
 *
 * Called with the interlock locked.
 *
 * NOTE: callers should call turnstile_cleanup after
 * dropping the interlock.
 */
static void
lck_mtx_unlock_wakeup(
	lck_mtx_t                       *lck,
	__kdebug_only thread_t          thread)
{
	struct turnstile *ts;
	kern_return_t did_wake;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START,
	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(thread), 0, 0, 0);

	ts = turnstile_get_by_id(lck->lck_mtx_tsid);

	/*
	 * We can skip turnstile_{prepare,cleanup} because
	 * we hold the interlock of the primitive,
	 * and enqueues/wakeups all happen under the interlock,
	 * which means the turnstile is stable.
	 */
	did_wake = waitq_wakeup64_one(&ts->ts_waitq, LCK_MTX_EVENT(lck),
	    THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
	assert(did_wake == KERN_SUCCESS);

	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}


#pragma mark lck_mtx_t: lck_mtx_lock

static inline bool
lck_mtx_ctid_on_core(uint32_t ctid)
{
	thread_t th = ctid_get_thread_unsafe(ctid);

	return th && machine_thread_on_core_allow_invalid(th);
}

#define LCK_MTX_OWNER_FOR_TRACE(lock) \
	VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe((lock)->lck_mtx.owner))

static void
lck_mtx_lock_adaptive_spin(lck_mtx_t *lock, lck_mtx_state_t state)
{
	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
	hw_spin_policy_t  pol = &lck_mtx_ilk_timeout_policy;
	hw_spin_timeout_t to  = hw_spin_compute_timeout(pol);
	hw_spin_state_t   ss  = { };
	uint64_t          deadline;

	lck_mtx_mcs_t     mcs, node;
	lck_mcs_id_t      idx, pidx, clear_idx;
	unsigned long     prev;
	lck_mtx_state_t   nstate;
	ast_t      *const astp = ast_pending();

	idx  = lck_mtx_get_mcs_id();
	mcs  = &lck_mcs_get_current()->mcs_mtx;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
	    trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);

	deadline = ml_get_timebase() + os_atomic_load(&MutexSpin, relaxed) * processor_avail_count;

	/*
	 *	Take a spot in the adaptive spin queue,
	 *	and then spin until we're at the head of it.
	 *
	 *	Until we're at the head, we do not need to monitor
	 *	for whether the current owner is on core or not:
	 *
	 *	1. the head of the queue is doing it already,
	 *
	 *	2. when the entire adaptive spin queue "gives up"
	 *	   as a result of the owner going off core, we want
	 *	   to avoid a thundering herd and let the AS queue
	 *	   pour into the interlock queue one at a time.
	 *
	 *	Do give up if the scheduler made noises that something
	 *	more important has shown up.
	 *
	 *	Note: this function is optimized so that we do not touch
	 *	      our local mcs node when we're the head of the queue.
	 *
	 *	      This allows us, when the contention is between
	 *	      2 cores only, to not touch this cacheline at all.
	 */
	pidx = os_atomic_xchg(&lock->lck_mtx.as_tail, idx, release);
	if (pidx) {
		node = lck_mtx_get_mcs(pidx);
		mcs->lmm_as_prev = pidx;
		os_atomic_store(&node->lmm_as_next, mcs, release);

		while (!hw_spin_wait_until(&mcs->lmm_as_prev, prev,
		    prev == 0 || (os_atomic_load(astp, relaxed) & AST_URGENT) || (ml_get_timebase() > deadline))) {
			hw_spin_should_keep_spinning(lock, pol, to, &ss);
		}

		if (__improbable(prev)) {
			goto adaptive_spin_fail;
		}

		clear_idx = 0;
	} else {
		clear_idx = idx;
	}

	/*
	 *	We're now first in line.
	 *
	 *	It's our responsibility to monitor the lock's state
	 *	for whether (1) the lock has become available,
	 *	(2) its owner has gone off core, (3) the scheduler
	 *	wants its CPU back, or (4) we've spun for too long.
	 */
	deadline = ml_get_timebase() + os_atomic_load(&MutexSpin, relaxed);

	for (;;) {
		state.val = lock_load_exclusive(&lock->lck_mtx.val, acquire);

		if (__probable(!state.ilocked && !state.ilk_tail && !state.owner)) {
			/*
			 * 2-core contention: if we can, try to dequeue
			 * ourselves from the adaptive spin queue
			 * as part of this CAS in order to avoid
			 * the cost of lck_mtx_ilk_lock_cleanup_as_mcs()
			 * and zeroing the mcs node at all.
			 *
			 * Because the queue is designed to limit contention,
			 * using store-exclusive over an armv8.1 LSE atomic
			 * is actually marginally better (presumably due to
			 * the better codegen).
			 */
			nstate = state;
			nstate.ilocked = true;
			if (state.as_tail == clear_idx) {
				nstate.as_tail = 0;
			}
			if (__probable(lock_store_exclusive(&lock->lck_mtx.val,
			    state.val, nstate.val, acquire))) {
				break;
			}
		} else {
			lock_wait_for_event();
		}

		if (__improbable(ml_get_timebase() > deadline ||
		    (os_atomic_load(astp, relaxed) & AST_URGENT) ||
		    (!state.ilocked && !state.ilk_tail && state.owner &&
		    !lck_mtx_ctid_on_core(state.owner)))) {
			goto adaptive_spin_fail;
		}
	}

	/*
	 *	If we're here, we got the lock, we just have to cleanup
	 *	the MCS nodes and return.
	 */
	if (state.as_tail != clear_idx) {
		lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
		lck_mtx_mcs_clear(mcs);
	}

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(current_thread()),
	    lock->lck_mtx_tsid, 0, 0);
	return;

adaptive_spin_fail:
	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
	    trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
	return lck_mtx_ilk_lock_contended(lock, state, LCK_ILK_MODE_FROM_AS);
}

static NOINLINE void
lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, lck_mtx_mode_t mode)
{
	struct turnstile *ts = TURNSTILE_NULL;
	lck_mtx_state_t   state;
	uint32_t          ctid = thread->ctid;
	uint32_t          data;
#if CONFIG_DTRACE
	int               first_miss = 0;
#endif /* CONFIG_DTRACE */
	bool              direct_wait = false;
	uint64_t          spin_start;
	uint32_t          profile;

	lck_mtx_check_irq(lock);
	if (mode == LCK_MTX_MODE_SLEEPABLE) {
		lock_disable_preemption_for_thread(thread);
	}

	for (;;) {
		/*
		 *	Load the current state and perform sanity checks
		 *
		 *	Note that the various "corrupt" values are designed
		 *	so that the slowpath is taken when a mutex was used
		 *	after destruction, so that we do not have to do
		 *	sanity checks in the fast path.
		 */
		state = os_atomic_load(&lock->lck_mtx, relaxed);
		if (state.owner == ctid) {
			__lck_mtx_owned_panic(lock, thread);
		}
		if (lock->lck_mtx_type != LCK_TYPE_MUTEX ||
		    state.data == LCK_MTX_TAG_DESTROYED) {
			__lck_mtx_invalid_panic(lock);
		}
		profile = (state.data & LCK_MTX_PROFILE);

		/*
		 *	Attempt steal
		 *
		 *	When the lock state is 0, no thread can be queued
		 *	for adaptive spinning or for the interlock yet.
		 *
		 *	As such we can attempt to take the interlock.
		 *	(we can't take the mutex directly because we need
		 *	the interlock to do turnstile operations on the way out).
		 */
		if ((state.val & ~(uint64_t)LCK_MTX_PROFILE) == 0) {
			if (!os_atomic_cmpxchgv(&lock->lck_mtx.val,
			    state.val, state.val | LCK_MTX_ILOCK,
			    &state.val, acquire)) {
				continue;
			}
			break;
		}

#if CONFIG_DTRACE
		if (profile) {
			LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
		}
#endif /* CONFIG_DTRACE */

		if (mode == LCK_MTX_MODE_SLEEPABLE) {
			spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
		} else {
			spin_start = LCK_MTX_SPIN_SPIN_BEGIN();
		}

		/*
		 *	Adaptive spin or interlock
		 *
		 *	Evaluate if adaptive spinning should be attempted,
		 *	and if yes go to adaptive spin.
		 *
		 *	Otherwise (and this includes always-spin mutexes),
		 *	go for the interlock.
		 */
		if (mode != LCK_MTX_MODE_SPIN_ALWAYS &&
		    (state.ilocked || state.as_tail || !state.owner ||
		    lck_mtx_ctid_on_core(state.owner))) {
			lck_mtx_lock_adaptive_spin(lock, state);
		} else {
			direct_wait = true;
			lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_DIRECT);
		}

		if (mode == LCK_MTX_MODE_SLEEPABLE) {
			LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
		} else {
			LCK_MTX_SPIN_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
		}

		/*
		 *	Take or sleep
		 *
		 *	We now have the interlock. Either the owner
		 *	isn't set, and the mutex is ours to claim,
		 *	or we must go to sleep.
		 *
		 *	If we go to sleep, we need to set LCK_MTX_NEEDS_WAKEUP
		 *	to force the current lock owner to call
		 *	lck_mtx_unlock_wakeup().
		 */
		state = os_atomic_load(&lock->lck_mtx, relaxed);
		if (state.owner == LCK_MTX_NULL_CTID) {
			break;
		}

		if (mode == LCK_MTX_MODE_SPIN_ALWAYS) {
			__lck_mtx_lock_is_sleepable_panic(lock);
		}

#if CONFIG_DTRACE
		if (profile) {
			LCK_MTX_PROF_WAIT(lock, lock->lck_mtx_grp,
			    direct_wait, &first_miss);
		}
#endif /* CONFIG_DTRACE */
		os_atomic_store(&lock->lck_mtx.data,
		    state.data | LCK_MTX_ILOCK | LCK_MTX_NEEDS_WAKEUP,
		    compiler_acq_rel);
		ts = lck_mtx_lock_wait(lock, thread,
		    ctid_get_thread(state.owner), ts);

		/* returns interlock unlocked and preemption re-enabled */
		lock_disable_preemption_for_thread(thread);
	}

	/*
	 *	We can take the lock!
	 *
	 *	We only have the interlock and the owner field is 0.
	 *
	 *	Perform various turnstile cleanups if needed,
	 *	claim the lock, and reenable preemption (if needed).
	 */
	if (ts) {
		lck_mtx_lock_wait_done(lock, ts);
	}
	data = ctid | profile;
	if (lck_mtx_lock_will_need_wakeup(lock, thread)) {
		data |= LCK_MTX_NEEDS_WAKEUP;
	}
	if (mode != LCK_MTX_MODE_SLEEPABLE) {
		data |= LCK_MTX_ILOCK | LCK_MTX_SPIN_MODE;
	}
	os_atomic_store(&lock->lck_mtx.data, data, release);

	if (mode == LCK_MTX_MODE_SLEEPABLE) {
		lock_enable_preemption();
	}

	assert(thread->turnstile != NULL);

	if (ts) {
		turnstile_cleanup();
	}
	LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
	    mode != LCK_MTX_MODE_SLEEPABLE, profile);
}

#if LCK_MTX_CHECK_INVARIANTS || CONFIG_DTRACE
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static void
lck_mtx_lock_slow(
	lck_mtx_t              *lock,
	thread_t                thread,
	lck_mtx_state_t         state,
	lck_mtx_mode_t          mode)
{
#pragma unused(state)
#if CONFIG_DTRACE
	lck_mtx_state_t ostate = {
		.data = LCK_MTX_PROFILE,
	};
#endif /* CONFIG_DTRACE */

#if LCK_MTX_CHECK_INVARIANTS
	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
		lck_mtx_check_preemption(lock, thread,
		    (mode == LCK_MTX_MODE_SPIN));
	}
#endif /* LCK_MTX_CHECK_INVARIANTS */
#if CONFIG_DTRACE
	if (state.val == ostate.val) {
		state.data = thread->ctid | LCK_MTX_PROFILE;
		if (mode != LCK_MTX_MODE_SLEEPABLE) {
			state.ilocked = true;
			state.spin_mode = true;
		}
		os_atomic_cmpxchgv(&lock->lck_mtx.val,
		    ostate.val, state.val, &state.val, acquire);
	}
	if ((state.val & ~ostate.val) == 0) {
		LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
		    mode != LCK_MTX_MODE_SLEEPABLE,
		    state.data & LCK_MTX_PROFILE);
		return;
	}
#endif /* CONFIG_DTRACE */
	lck_mtx_lock_contended(lock, thread, mode);
}

static __attribute__((always_inline)) void
lck_mtx_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
{
	thread_t thread = current_thread();
	lck_mtx_state_t state = {
		.data = thread->ctid,
	};
	uint64_t take_slowpath = 0;

	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
		take_slowpath |= LCK_MTX_SNIFF_PREEMPTION(thread);
	}
	take_slowpath |= LCK_MTX_SNIFF_DTRACE();

	if (mode != LCK_MTX_MODE_SLEEPABLE) {
		lock_disable_preemption_for_thread(thread);
		state.ilocked = true;
		state.spin_mode = true;
	}

	/*
	 * Do the CAS on the entire mutex state,
	 * which hence requires for the ILK/AS queues
	 * to be empty (which is fairer).
	 */
	lock_cmpxchgv(&lock->lck_mtx.val,
	    0, state.val, &state.val, acquire);

	take_slowpath |= state.val;
	if (__improbable(take_slowpath)) {
		return lck_mtx_lock_slow(lock, thread, state, mode);
	}
}

__mockable void
lck_mtx_lock(lck_mtx_t *lock)
{
	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
}

void
lck_mtx_lock_spin(lck_mtx_t *lock)
{
	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
}

void
lck_mtx_lock_spin_always(lck_mtx_t *lock)
{
	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
}


#pragma mark lck_mtx_t: lck_mtx_try_lock

static __attribute__((always_inline)) bool
lck_mtx_try_lock_slow_inline(
	lck_mtx_t              *lock,
	thread_t                thread,
	uint32_t                odata,
	uint32_t                ndata,
	bool                    spin)
{
#pragma unused(lock, thread, odata, ndata)
#if CONFIG_DTRACE
	if (odata == LCK_MTX_PROFILE) {
		os_atomic_cmpxchgv(&lock->lck_mtx.data,
		    odata, ndata | LCK_MTX_PROFILE, &odata, acquire);
	}
	if ((odata & ~LCK_MTX_PROFILE) == 0) {
		LCK_MTX_TRY_ACQUIRED(lock, lock->lck_mtx_grp,
		    spin, odata & LCK_MTX_PROFILE);
		return true;
	}
	if (odata & LCK_MTX_PROFILE) {
		LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &(int){ 0 });
	}
#endif /* CONFIG_DTRACE */

	if (spin) {
		lock_enable_preemption();
	}
	return false;
}

#if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static bool
lck_mtx_try_lock_slow(
	lck_mtx_t              *lock,
	thread_t                thread,
	uint32_t                odata,
	uint32_t                ndata)
{
	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, false);
}

#if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static bool
lck_mtx_try_lock_slow_spin(
	lck_mtx_t              *lock,
	thread_t                thread,
	uint32_t                odata,
	uint32_t                ndata)
{
	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, true);
}

static __attribute__((always_inline)) bool
lck_mtx_try_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
{
	thread_t thread = current_thread();
	uint32_t odata, ndata = thread->ctid;
	uint32_t take_slowpath = 0;

#if CONFIG_DTRACE
	take_slowpath |= lck_debug_state.lds_value;
#endif
	if (mode != LCK_MTX_MODE_SLEEPABLE) {
		lock_disable_preemption_for_thread(thread);
		ndata |= LCK_MTX_SPIN_MODE | LCK_MTX_ILOCK;
	}

	/*
	 * Because try_lock is likely to be used for cases
	 * like lock inversion resolution, it tries a bit harder
	 * than lck_mtx_lock() to take the lock: it ignores the
	 * adaptive spin / interlock queues by doing the CAS
	 * on the 32-bit mutex data only.
	 * (See the usage sketch after the wrappers below.)
	 */
	lock_cmpxchgv(&lock->lck_mtx.data, 0, ndata, &odata, acquire);

	take_slowpath |= odata;
	if (__probable(!take_slowpath)) {
		return true;
	}

	if (mode == LCK_MTX_MODE_SPIN_ALWAYS &&
	    (odata & LCK_MTX_CTID_MASK) &&
	    !(odata & LCK_MTX_SPIN_MODE)) {
		__lck_mtx_lock_is_sleepable_panic(lock);
	}

	if (mode == LCK_MTX_MODE_SLEEPABLE) {
		return lck_mtx_try_lock_slow(lock, thread, odata, ndata);
	} else {
		return lck_mtx_try_lock_slow_spin(lock, thread, odata, ndata);
	}
}

boolean_t
lck_mtx_try_lock(lck_mtx_t *lock)
{
	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
}

boolean_t
lck_mtx_try_lock_spin(lck_mtx_t *lock)
{
	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
}

boolean_t
lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
{
	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
}
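
/*
 * Usage sketch (illustrative, with hypothetical locks "outer" and
 * "inner" that must be taken in that order): the classic
 * lock-inversion resolution loop that motivates the try_lock fast
 * path above.
 *
 *	lck_mtx_lock(outer);
 *	while (!lck_mtx_try_lock(inner)) {
 *		// give the other thread a chance to release "inner"
 *		lck_mtx_unlock(outer);
 *		mutex_pause(0);
 *		lck_mtx_lock(outer);
 *	}
 *	// both locks held here, in order
 */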


#pragma mark lck_mtx_t: lck_mtx_unlock

static NOINLINE void
lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, uint32_t data)
{
	bool cleanup = false;

#if !CONFIG_DTRACE
	/*
	 * This check is done by lck_mtx_unlock_slow() when it is enabled.
	 */
	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
		__lck_mtx_not_owned_panic(lock, thread);
	}
#endif /* !CONFIG_DTRACE */

	if ((data & LCK_MTX_SPIN_MODE) == 0) {
		lock_disable_preemption_for_thread(thread);
		lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_UNLOCK);
	}

	/*
	 * We must re-load the data: we might have taken
	 * the slowpath because another thread had taken
	 * the interlock and set the NEEDS_WAKEUP bit
	 * while we were spinning to get it.
	 */
	data = os_atomic_load(&lock->lck_mtx.data, compiler_acq_rel);
	if (data & LCK_MTX_NEEDS_WAKEUP) {
		lck_mtx_unlock_wakeup(lock, thread);
		cleanup = true;
	}
	lck_mtx_ilk_unlock_v(lock, data & LCK_MTX_PROFILE);

	LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, data & LCK_MTX_PROFILE);

	/*
	 * Do not do any turnstile operations outside of this block.
	 *
	 * lock/unlock is called at an early stage of boot while single
	 * threaded, without turnstiles being available yet.
	 * Even without contention we can come through the slow path
	 * if the mutex is acquired as a spin lock.
	 */
	if (cleanup) {
		turnstile_cleanup();
	}
}

#if CONFIG_DTRACE
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static void
lck_mtx_unlock_slow(lck_mtx_t *lock, thread_t thread, uint32_t data)
{
#if CONFIG_DTRACE
	/*
	 *	If DTrace is enabled, locks can be profiled,
	 *	which causes the fastpath of unlock to fail.
	 */
	if ((data & LCK_MTX_BITS_MASK) == LCK_MTX_PROFILE) {
		os_atomic_cmpxchgv(&lock->lck_mtx.data, data, LCK_MTX_PROFILE,
		    &data, release);
	}
	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
		__lck_mtx_not_owned_panic(lock, thread);
	}
	if ((data & (LCK_MTX_BITS_MASK & ~LCK_MTX_PROFILE)) == 0) {
		LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, false);
		return;
	}
#endif /* CONFIG_DTRACE */

	lck_mtx_unlock_contended(lock, thread, data);
}

__mockable void
lck_mtx_unlock(lck_mtx_t *lock)
{
	thread_t thread = current_thread();
	uint32_t take_slowpath = 0;
	uint32_t data;

	take_slowpath |= LCK_MTX_SNIFF_DTRACE();

	/*
	 * The fast path ignores the ILK/AS queues on purpose,
	 * those really are a "lock" concept, not unlock.
	 */
	if (__probable(lock_cmpxchgv(&lock->lck_mtx.data,
	    thread->ctid, 0, &data, release))) {
		if (__probable(!take_slowpath)) {
			return;
		}
	}

	lck_mtx_unlock_slow(lock, thread, data);
}


#pragma mark lck_mtx_t: misc

void
lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
{
	lck_mtx_state_t state  = os_atomic_load(&lock->lck_mtx, relaxed);
	thread_t        thread = current_thread();

	if (type == LCK_MTX_ASSERT_OWNED) {
		if (state.owner != thread->ctid) {
			__lck_mtx_not_owned_panic(lock, thread);
		}
	} else if (type == LCK_MTX_ASSERT_NOTOWNED) {
		if (state.owner == thread->ctid) {
			__lck_mtx_owned_panic(lock, thread);
		}
	} else {
		panic("lck_mtx_assert(): invalid arg (%u)", type);
	}
}
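
/*
 * Illustrative usage (not from this file): sprinkling ownership
 * assertions around a critical section, with "my_mtx" a hypothetical
 * mutex.
 *
 *	lck_mtx_lock(my_mtx);
 *	lck_mtx_assert(my_mtx, LCK_MTX_ASSERT_OWNED);
 *	// ... mutate state protected by my_mtx ...
 *	lck_mtx_unlock(my_mtx);
 *	lck_mtx_assert(my_mtx, LCK_MTX_ASSERT_NOTOWNED);
 */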

#if !LCK_MTX_USE_ARCH
void
lck_mtx_assert_owned_spin(lck_mtx_t *lock)
{
	lck_mtx_state_t state  = os_atomic_load(&lock->lck_mtx, relaxed);
	thread_t        thread = current_thread();

	if (state.owner != thread->ctid) {
		__lck_mtx_not_owned_panic(lock, thread);
	}

	if (!state.spin_mode) {
		__lck_mtx_not_locked_spin(lock, thread);
	}
}
#endif /* !LCK_MTX_USE_ARCH */

/*
 *	Routine:	lck_mtx_convert_spin
 *
 *	Convert a mutex held for spin into a held full mutex
 */
void
lck_mtx_convert_spin(lck_mtx_t *lock)
{
	lck_mtx_state_t state  = os_atomic_load(&lock->lck_mtx, relaxed);
	thread_t        thread = current_thread();
	uint32_t        data   = thread->ctid;

	if (state.owner != data) {
		__lck_mtx_not_owned_panic(lock, thread);
	}

	if (state.spin_mode) {
		/*
		 * Note: we can acquire the lock in spin mode
		 *       _and_ be the inheritor if we waited.
		 *
		 *       We must only clear ilocked and spin_mode,
		 *       but preserve owner and needs_wakeup.
		 */
		state.ilocked = false;
		state.spin_mode = false;
		lck_mtx_ilk_unlock_v(lock, state.data);
		turnstile_cleanup();
	}
}
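
/*
 * Usage sketch (illustrative): a caller that must examine some state
 * quickly, then decide to do blocking work while still holding the
 * mutex. "my_mtx" is a hypothetical mutex and "need_to_block" a
 * hypothetical predicate.
 *
 *	lck_mtx_lock_spin(my_mtx);            // cheap, non-blocking hold
 *	if (need_to_block()) {
 *		lck_mtx_convert_spin(my_mtx); // now a full mutex...
 *		do_blocking_work();           // ...so blocking is legal
 *	}
 *	lck_mtx_unlock(my_mtx);
 */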

/*
 * Routine: kdp_lck_mtx_lock_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 */
boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
{
	lck_mtx_state_t state = os_atomic_load(&lck->lck_mtx, relaxed);

	if (not_in_kdp) {
		panic("panic: spinlock acquired check done outside of kernel debugger");
	}
	if (state.data == LCK_MTX_TAG_DESTROYED) {
		return false;
	}
	return state.owner || state.ilocked;
}

void
kdp_lck_mtx_find_owner(
	struct waitq           *waitq __unused,
	event64_t               event,
	thread_waitinfo_t      *waitinfo)
{
	lck_mtx_t      *mutex  = LCK_EVENT_TO_MUTEX(event);
	lck_mtx_state_t state  = os_atomic_load(&mutex->lck_mtx, relaxed);

	assert3u(state.data, !=, LCK_MTX_TAG_DESTROYED);
	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
	waitinfo->owner   = thread_tid(ctid_get_thread(state.owner));
}

#endif /* !LCK_MTX_USE_ARCH */

/*
 * Routine:     mutex_pause
 *
 * Called by former callers of simple_lock_pause().
 */
#define MAX_COLLISION_COUNTS    32
#define MAX_COLLISION   8

unsigned int max_collision_count[MAX_COLLISION_COUNTS];

uint32_t collision_backoffs[MAX_COLLISION] = {
	10, 50, 100, 200, 400, 600, 800, 1000
};


void
mutex_pause(uint32_t collisions)
{
	wait_result_t wait_result;
	uint32_t      back_off;

	if (collisions >= MAX_COLLISION_COUNTS) {
		collisions = MAX_COLLISION_COUNTS - 1;
	}
	max_collision_count[collisions]++;

	if (collisions >= MAX_COLLISION) {
		collisions = MAX_COLLISION - 1;
	}
	back_off = collision_backoffs[collisions];

	wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, back_off, NSEC_PER_USEC);
	assert(wait_result == THREAD_WAITING);

	wait_result = thread_block(THREAD_CONTINUE_NULL);
	assert(wait_result == THREAD_TIMED_OUT);
}
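
/*
 * Worked example: the backoff table above is indexed by the (clamped)
 * collision count and expressed in microseconds, so
 *
 *	mutex_pause(0)  sleeps for   10us
 *	mutex_pause(3)  sleeps for  200us
 *	mutex_pause(9)  sleeps for 1000us (clamped to MAX_COLLISION - 1)
 *
 * i.e. repeated collisions back off roughly geometrically up to 1ms.
 */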


unsigned int mutex_yield_wait = 0;
unsigned int mutex_yield_no_wait = 0;

boolean_t
lck_mtx_yield(
	lck_mtx_t   *lck)
{
	bool has_waiters = LCK_MTX_HAS_WAITERS(lck);

#if DEBUG
	lck_mtx_assert(lck, LCK_MTX_ASSERT_OWNED);
#endif /* DEBUG */

	if (!has_waiters) {
		mutex_yield_no_wait++;
	} else {
		mutex_yield_wait++;
		lck_mtx_unlock(lck);
		mutex_pause(0);
		lck_mtx_lock(lck);
	}
	return has_waiters;
}
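
/*
 * Usage sketch (illustrative): a long-running loop that holds the
 * mutex but periodically lets waiters in. "work_remains" and
 * "do_one_unit" are hypothetical helpers.
 *
 *	lck_mtx_lock(my_mtx);
 *	while (work_remains()) {
 *		do_one_unit();
 *		(void)lck_mtx_yield(my_mtx); // drops/retakes only if contended
 *	}
 *	lck_mtx_unlock(my_mtx);
 */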