xref: /xnu-10063.121.3/osfmk/kern/lock_mtx.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #define LOCK_PRIVATE 1
30 
31 #include <mach_ldebug.h>
32 #include <kern/locks_internal.h>
33 #include <kern/lock_stat.h>
34 #include <kern/locks.h>
35 #include <kern/kalloc.h>
36 #include <kern/thread.h>
37 
38 #include <mach/machine/sdt.h>
39 
40 #include <machine/cpu_data.h>
41 #include <machine/machine_cpu.h>
42 
43 #if !LCK_MTX_USE_ARCH
44 
45 /*
46  * lck_mtx_t
47  * ~~~~~~~~~
48  *
49  * Kernel mutexes in this implementation are made of four 32-bit words:
50  *
51  *   - word 0: turnstile compact ID (24 bits) and the 0x22 lock tag
52  *   - word 1: padding (to be used for group compact IDs)
53  *   - word 2: mutex state (lock owner + interlock, spin and waiters bits),
54  *             referred to as "data" in the code.
55  *   - word 3: adaptive spin and interlock MCS queue tails.
56  *
57  * The 64-bit word made of the last two words is referred to
58  * as the "mutex state" in the code.
59  *
60  *
61  * Core serialization rules
62  * ~~~~~~~~~~~~~~~~~~~~~~~~
63  *
64  * The mutex has a bit (lck_mtx_t::lck_mtx.ilocked or bit LCK_MTX_ILOCK
65  * of the data word) that serves as a spinlock for the mutex state.
66  *
67  *
68  * Updating the lock fields must follow these rules:
69  *
70  *   - It is ok to "steal" the mutex (updating its data field) if no one
71  *     holds the interlock.
72  *
73  *   - Holding the interlock allows its holder to update the first 3 words
74  *     of the kernel mutex without using RMW atomics (plain stores are OK).
75  *
76  *   - Holding the interlock is required for a thread to remove itself
77  *     from the adaptive spin queue.
78  *
79  *   - Threads can enqueue themselves onto the adaptive spin wait queue
80  *     or the interlock wait queue at any time.
81  *
82  *
83  * Waiters bit and turnstiles
84  * ~~~~~~~~~~~~~~~~~~~~~~~~~~
85  *
86  * The turnstile on a kernel mutex is set by waiters, and cleared
87  * once they have all been resumed and successfully acquired the lock.
88  *
89  * LCK_MTX_NEEDS_WAKEUP being set (always with an owner set too)
90  * forces threads to the lck_mtx_unlock slowpath,
91  * in order to evaluate whether lck_mtx_unlock_wakeup() must be called.
92  *
93  * As a result, it really only needs to be set at select times:
94  *
95  *   - when a thread blocks and "snitches" on the current thread owner,
96  *     so that when that thread unlocks it calls lck_mtx_unlock_wakeup(),
97  *
98  *   - when a thread that was woken up resumes its work and becomes
99  *     the inheritor.
100  */
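
/*
 * Illustrative sketch only (an assumption for exposition; the authoritative
 * layout lives in the locks headers, not in this file): the code below
 * manipulates words 2 and 3 through an lck_mtx_state_t overlay shaped
 * roughly like:
 *
 *	union {
 *		uint64_t val;              // words 2+3, the 64-bit "mutex state"
 *		struct {
 *			uint32_t data;     // owner ctid + ILOCK/SPIN/WAKEUP/PROFILE bits
 *			uint16_t as_tail;  // adaptive spin MCS queue tail
 *			uint16_t ilk_tail; // interlock MCS queue tail
 *		};
 *	};
 */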
101 
102 #define ADAPTIVE_SPIN_ENABLE 0x1
103 
104 #define NOINLINE                __attribute__((noinline))
105 #define LCK_MTX_EVENT(lck)      CAST_EVENT64_T(&(lck)->lck_mtx.data)
106 #define LCK_EVENT_TO_MUTEX(e)   __container_of((uint32_t *)(e), lck_mtx_t, lck_mtx.data)
107 #define LCK_MTX_HAS_WAITERS(l)  ((l)->lck_mtx.data & LCK_MTX_NEEDS_WAKEUP)
108 
109 #if DEVELOPMENT || DEBUG
110 TUNABLE(bool, LckDisablePreemptCheck, "-disable_mtx_chk", false);
111 #endif /* DEVELOPMENT || DEBUG */
112 
113 extern unsigned int not_in_kdp;
114 
115 KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);
116 
117 #define LCK_MTX_NULL_CTID       0x00000000u
118 
119 __enum_decl(lck_mtx_mode_t, uint32_t, {
120 	LCK_MTX_MODE_SLEEPABLE,
121 	LCK_MTX_MODE_SPIN,
122 	LCK_MTX_MODE_SPIN_ALWAYS,
123 });
124 
125 __enum_decl(lck_ilk_mode_t, uint32_t, {
126 	LCK_ILK_MODE_UNLOCK,
127 	LCK_ILK_MODE_DIRECT,
128 	LCK_ILK_MODE_FROM_AS,
129 });
130 
131 static inline void
132 lck_mtx_mcs_clear(lck_mtx_mcs_t mcs)
133 {
134 	*mcs = (struct lck_mtx_mcs){ };
135 }
136 
137 static inline lck_mcs_id_t
138 lck_mtx_get_mcs_id(void)
139 {
140 	return lck_mcs_id_current(LCK_MCS_SLOT_0);
141 }
142 
143 __pure2
144 static inline lck_mtx_mcs_t
145 lck_mtx_get_mcs(lck_mcs_id_t idx)
146 {
147 	return &lck_mcs_get_other(idx)->mcs_mtx;
148 }
149 
150 
151 #pragma mark lck_mtx_t: validation
152 
153 __abortlike
154 static void
155 __lck_mtx_invalid_panic(lck_mtx_t *lck)
156 {
157 	panic("Invalid/destroyed mutex %p: "
158 	    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
159 	    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
160 	    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
161 	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
162 }
163 
164 __abortlike
165 static void
166 __lck_mtx_not_owned_panic(lck_mtx_t *lock, thread_t thread)
167 {
168 	panic("Mutex %p is unexpectedly not owned by thread %p", lock, thread);
169 }
170 
171 __abortlike
172 static void
173 __lck_mtx_owned_panic(lck_mtx_t *lock, thread_t thread)
174 {
175 	panic("Mutex %p is unexpectedly owned by thread %p", lock, thread);
176 }
177 
178 __abortlike
179 static void
180 __lck_mtx_lock_is_sleepable_panic(lck_mtx_t *lck)
181 {
182 	// "Always" variants can never block. If the lock is held as a normal mutex
183 	// then someone is mixing always and non-always calls on the same lock, which is
184 	// forbidden.
185 	panic("Mutex %p is held as a full-mutex (spin-always lock attempted)", lck);
186 }
187 
188 #if DEVELOPMENT || DEBUG
189 __abortlike
190 static void
191 __lck_mtx_preemption_disabled_panic(lck_mtx_t *lck, int expected)
192 {
193 	panic("Attempt to take mutex %p with preemption disabled (%d)",
194 	    lck, get_preemption_level() - expected);
195 }
196 
197 __abortlike
198 static void
199 __lck_mtx_at_irq_panic(lck_mtx_t *lck)
200 {
201 	panic("Attempt to take mutex %p in IRQ context", lck);
202 }
203 
204 /*
205  *	Routine:	lck_mtx_check_preemption
206  *
207  *	Verify preemption is enabled when attempting to acquire a mutex.
208  */
209 static inline void
210 lck_mtx_check_preemption(lck_mtx_t *lock, thread_t thread, int expected)
211 {
212 #pragma unused(thread)
213 	if (lock_preemption_level_for_thread(thread) == expected) {
214 		return;
215 	}
216 	if (LckDisablePreemptCheck) {
217 		return;
218 	}
219 	if (current_cpu_datap()->cpu_hibernate) {
220 		return;
221 	}
222 	if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
223 		return;
224 	}
225 	__lck_mtx_preemption_disabled_panic(lock, expected);
226 }
227 
228 static inline void
229 lck_mtx_check_irq(lck_mtx_t *lock)
230 {
231 	if (ml_at_interrupt_context()) {
232 		__lck_mtx_at_irq_panic(lock);
233 	}
234 }
235 
236 #define LCK_MTX_SNIFF_PREEMPTION(thread)   lock_preemption_level_for_thread(thread)
237 #define LCK_MTX_CHECK_INVARIANTS           1
238 #else
239 #define lck_mtx_check_irq(lck)             ((void)0)
240 #define LCK_MTX_SNIFF_PREEMPTION(thread)   0
241 #define LCK_MTX_CHECK_INVARIANTS           0
242 #endif /* !DEVELOPMENT && !DEBUG */
243 
244 #if CONFIG_DTRACE
245 #define LCK_MTX_SNIFF_DTRACE()             lck_debug_state.lds_value
246 #else
247 #define LCK_MTX_SNIFF_DTRACE()             0
248 #endif
249 
250 
251 #pragma mark lck_mtx_t: alloc/init/destroy/free
252 
253 lck_mtx_t *
254 lck_mtx_alloc_init(lck_grp_t *grp, lck_attr_t *attr)
255 {
256 	lck_mtx_t      *lck;
257 
258 	lck = zalloc(KT_LCK_MTX);
259 	lck_mtx_init(lck, grp, attr);
260 	return lck;
261 }
262 
263 void
264 lck_mtx_free(lck_mtx_t *lck, lck_grp_t *grp)
265 {
266 	lck_mtx_destroy(lck, grp);
267 	zfree(KT_LCK_MTX, lck);
268 }
269 
270 void
271 lck_mtx_init(lck_mtx_t *lck, lck_grp_t *grp, lck_attr_t *attr)
272 {
273 	if (attr == LCK_ATTR_NULL) {
274 		attr = &lck_attr_default;
275 	}
276 
277 	*lck = (lck_mtx_t){
278 		.lck_mtx_type = LCK_TYPE_MUTEX,
279 		.lck_mtx_grp  = grp->lck_grp_attr_id,
280 	};
281 	if (attr->lck_attr_val & LCK_ATTR_DEBUG) {
282 		lck->lck_mtx.data |= LCK_MTX_PROFILE;
283 	}
284 
285 	lck_grp_reference(grp, &grp->lck_grp_mtxcnt);
286 }
287 
288 void
289 lck_mtx_destroy(lck_mtx_t *lck, lck_grp_t *grp)
290 {
291 	if (lck->lck_mtx_tsid && lck->lck_mtx_type == LCK_TYPE_MUTEX) {
292 		panic("Mutex to destroy still has waiters: %p: "
293 		    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
294 		    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
295 		    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
296 		    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
297 	}
298 	if (lck->lck_mtx_type != LCK_TYPE_MUTEX ||
299 	    (lck->lck_mtx.data & ~LCK_MTX_PROFILE) ||
300 	    lck->lck_mtx.as_tail || lck->lck_mtx.ilk_tail) {
301 		__lck_mtx_invalid_panic(lck);
302 	}
303 	LCK_GRP_ASSERT_ID(grp, lck->lck_mtx_grp);
304 	lck->lck_mtx_type = LCK_TYPE_NONE;
305 	lck->lck_mtx.data = LCK_MTX_TAG_DESTROYED;
306 	lck->lck_mtx_grp      = 0;
307 	lck_grp_deallocate(grp, &grp->lck_grp_mtxcnt);
308 }
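
/*
 * Typical lifecycle, as a hedged usage sketch ("my_grp" and "mtx" are
 * hypothetical names, not part of this file):
 *
 *	lck_mtx_t *mtx = lck_mtx_alloc_init(my_grp, LCK_ATTR_NULL);
 *	lck_mtx_lock(mtx);
 *	...critical section...
 *	lck_mtx_unlock(mtx);
 *	lck_mtx_free(mtx, my_grp);      // lck_mtx_destroy() + zfree()
 */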
309 
310 
311 #pragma mark lck_mtx_t: lck_mtx_ilk*
312 
313 static hw_spin_timeout_status_t
314 lck_mtx_ilk_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
315 {
316 	lck_mtx_t *lck = _lock;
317 
318 	panic("Mutex interlock[%p] " HW_SPIN_TIMEOUT_FMT "; "
319 	    "current owner: %p, "
320 	    "<0x%06x 0x%02x 0x%08x 0x%08x 0x%04x 0x%04x>, "
321 	    HW_SPIN_TIMEOUT_DETAILS_FMT,
322 	    lck, HW_SPIN_TIMEOUT_ARG(to, st),
323 	    ctid_get_thread_unsafe(lck->lck_mtx.owner),
324 	    lck->lck_mtx_tsid, lck->lck_mtx_type,
325 	    lck->lck_mtx_grp, lck->lck_mtx.data,
326 	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail,
327 	    HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
328 }
329 
330 static const struct hw_spin_policy lck_mtx_ilk_timeout_policy = {
331 	.hwsp_name              = "lck_mtx_t (ilk)",
332 	.hwsp_timeout_atomic    = &lock_panic_timeout,
333 	.hwsp_op_timeout        = lck_mtx_ilk_timeout_panic,
334 };
335 
336 static void
337 lck_mtx_ilk_lock_cleanup_as_mcs(
338 	lck_mtx_t               *lock,
339 	lck_mcs_id_t             idx,
340 	lck_mtx_mcs_t            mcs,
341 	hw_spin_timeout_t        to,
342 	hw_spin_state_t         *ss)
343 {
344 	lck_mtx_mcs_t nnode = NULL;
345 	lck_mcs_id_t  pidx  = (lck_mcs_id_t)mcs->lmm_as_prev;
346 	bool          was_last;
347 
348 	/*
349 	 *	This is called when the thread made use
350 	 *	of the adaptive spin queue and needs
351 	 *	to remove itself from it.
352 	 */
353 
354 	/*
355 	 *	If the thread is last, set the tail to the node before us.
356 	 */
357 	was_last = lock_cmpxchg(&lock->lck_mtx.as_tail, idx, pidx, release);
358 
359 	if (was_last) {
360 		/*
361 		 *	If @c mcs was last, we need to erase the previous
362 		 *	node link to it.
363 		 *
364 		 *	However, new nodes could have now taken our place
365 		 *	and set the previous node's @c lmm_as_next field
366 		 *	already, so we must CAS rather than blindly set.
367 		 *
368 		 *	We know the previous node is stable because
369 		 *	we hold the interlock (preventing concurrent
370 		 *	removals).
371 		 */
372 		if (pidx) {
373 			os_atomic_cmpxchg(&lck_mtx_get_mcs(pidx)->lmm_as_next,
374 			    mcs, nnode, relaxed);
375 		}
376 	} else {
377 		/*
378 		 *	If @c mcs wasn't last, then wait to make sure
379 		 *	we observe @c lmm_as_next. Once we do, we know
380 		 *	the field is stable since we hold the interlock
381 		 *	(preventing concurrent dequeues).
382 		 *
383 	 *	We can then update it to @c mcs's next node index
384 		 *	(which is also stable for similar reasons).
385 		 *
386 		 *	Lastly update the previous node @c lmm_as_next
387 		 *	field as well to terminate the dequeue.
388 		 */
389 		while (!hw_spin_wait_until(&mcs->lmm_as_next, nnode, nnode)) {
390 			hw_spin_policy_t pol = &lck_mtx_ilk_timeout_policy;
391 			hw_spin_should_keep_spinning(lock, pol, to, ss);
392 		}
393 
394 		os_atomic_store(&nnode->lmm_as_prev, pidx, relaxed);
395 		if (pidx) {
396 			os_atomic_store(&lck_mtx_get_mcs(pidx)->lmm_as_next,
397 			    nnode, relaxed);
398 		}
399 	}
400 
401 	/*
402 	 *	@c mcs's fields are left dangling,
403 	 *	it is the responsibility of the caller
404 	 *	to terminate the cleanup.
405 	 */
406 }
407 
408 static NOINLINE void
409 lck_mtx_ilk_lock_contended(
410 	lck_mtx_t              *lock,
411 	lck_mtx_state_t         state,
412 	lck_ilk_mode_t          mode)
413 {
414 	hw_spin_policy_t  pol = &lck_mtx_ilk_timeout_policy;
415 	hw_spin_timeout_t to  = hw_spin_compute_timeout(pol);
416 	hw_spin_state_t   ss  = { };
417 
418 	lck_mtx_mcs_t     mcs, nnode, pnode;
419 	lck_mcs_id_t      idx, pidx;
420 	lck_mtx_state_t   nstate;
421 	unsigned long     ready;
422 	uint64_t          spin_start;
423 
424 	/*
425 	 *	Take a spot in the interlock MCS queue,
426 	 *	and then spin until we're at the head of it.
427 	 */
428 
429 	idx  = lck_mtx_get_mcs_id();
430 	mcs  = &lck_mcs_get_current()->mcs_mtx;
431 	if (mode != LCK_MTX_MODE_SPIN) {
432 		spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
433 	}
434 
435 	mcs->lmm_ilk_current = lock;
436 	pidx = os_atomic_xchg(&lock->lck_mtx.ilk_tail, idx, release);
437 	if (pidx) {
438 		pnode = lck_mtx_get_mcs(pidx);
439 		os_atomic_store(&pnode->lmm_ilk_next, mcs, relaxed);
440 
441 		while (!hw_spin_wait_until(&mcs->lmm_ilk_ready, ready, ready)) {
442 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
443 		}
444 	}
445 
446 
447 	/*
448 	 *	We're now the first in line, wait for the interlock
449 	 *	to look ready and take it.
450 	 *
451 	 *	We can't just assume the lock is ours for the taking,
452 	 *	because the fastpath of lck_mtx_lock_spin{,_always}
453 	 *	only looks at the mutex "data" and might steal it.
454 	 *
455 	 *	Also clear the interlock MCS tail if @c mcs is last.
456 	 */
457 	do {
458 		while (!hw_spin_wait_until(&lock->lck_mtx.val,
459 		    state.val, state.ilocked == 0)) {
460 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
461 		}
462 
463 		nstate = state;
464 		nstate.ilocked = 1;
465 		if (nstate.ilk_tail == idx) {
466 			nstate.ilk_tail = 0;
467 		}
468 	} while (!os_atomic_cmpxchg(&lock->lck_mtx, state, nstate, acquire));
469 
470 
471 	/*
472 	 *	We now have the interlock, let's cleanup the MCS state.
473 	 *
474 	 *	First, if there is a node after us, notify that it
475 	 *	is at the head of the interlock queue.
476 	 *
477 	 *	Second, perform the adaptive spin MCS cleanup if needed.
478 	 *
479 	 *	Lastly, clear the MCS node.
480 	 */
481 	if (state.ilk_tail != idx) {
482 		while (!hw_spin_wait_until(&mcs->lmm_ilk_next, nnode, nnode)) {
483 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
484 		}
485 
486 		os_atomic_store(&nnode->lmm_ilk_ready, 1, relaxed);
487 	}
488 
489 	if (mode == LCK_ILK_MODE_FROM_AS) {
490 		lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
491 	}
492 	lck_mtx_mcs_clear(mcs);
493 
494 	if (mode != LCK_MTX_MODE_SPIN) {
495 		LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
496 	}
497 }
498 
499 static void
500 lck_mtx_ilk_lock_nopreempt(lck_mtx_t *lock, lck_ilk_mode_t mode)
501 {
502 	lck_mtx_state_t state, nstate;
503 
504 	os_atomic_rmw_loop(&lock->lck_mtx.val, state.val, nstate.val, acquire, {
505 		if (__improbable(state.ilocked || state.ilk_tail)) {
506 		        os_atomic_rmw_loop_give_up({
507 				return lck_mtx_ilk_lock_contended(lock, state, mode);
508 			});
509 		}
510 
511 		nstate = state;
512 		nstate.ilocked = true;
513 	});
514 }
515 
516 static void
517 lck_mtx_ilk_unlock_v(lck_mtx_t *lock, uint32_t data)
518 {
519 	os_atomic_store(&lock->lck_mtx.data, data, release);
520 	lock_enable_preemption();
521 }
522 
523 static void
524 lck_mtx_ilk_unlock(lck_mtx_t *lock)
525 {
526 	lck_mtx_ilk_unlock_v(lock, lock->lck_mtx.data & ~LCK_MTX_ILOCK);
527 }
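
/*
 * Hedged sketch of how the slow paths below use the interlock: take it with
 * preemption disabled, update the "data" word with plain stores, then
 * publish and re-enable preemption on the way out:
 *
 *	lock_disable_preemption_for_thread(thread);
 *	lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_DIRECT);
 *	...update lock->lck_mtx.data under the interlock...
 *	lck_mtx_ilk_unlock(lock);       // release store + preemption re-enabled
 */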
528 
529 
530 #pragma mark lck_mtx_t: turnstile integration
531 
532 /*
533  * Routine: lck_mtx_lock_wait
534  *
535  * Invoked in order to wait on contention.
536  *
537  * Called with the interlock locked and
538  * returns it unlocked.
539  *
540  * Always aggressively sets the owning thread to promoted,
541  * even if it's the same or higher priority.
542  * This prevents it from lowering its own priority while holding a lock.
543  *
544  * TODO: Come up with a more efficient way to handle same-priority promotions
545  *      <rdar://problem/30737670> ARM mutex contention logic could avoid taking the thread lock
546  */
547 static struct turnstile *
548 lck_mtx_lock_wait(
549 	lck_mtx_t              *lck,
550 	thread_t                self,
551 	thread_t                holder,
552 	struct turnstile       *ts)
553 {
554 	uint64_t sleep_start = LCK_MTX_BLOCK_BEGIN();
555 
556 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
557 	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(self), 0, 0, 0);
558 
559 	if (ts == TURNSTILE_NULL) {
560 		ts = turnstile_prepare_compact_id((uintptr_t)lck,
561 		    lck->lck_mtx_tsid, TURNSTILE_KERNEL_MUTEX);
562 		if (lck->lck_mtx_tsid == 0) {
563 			lck->lck_mtx_tsid = ts->ts_compact_id;
564 		}
565 	}
566 	assert3u(ts->ts_compact_id, ==, lck->lck_mtx_tsid);
567 
568 	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
569 	turnstile_update_inheritor(ts, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
570 
571 	waitq_assert_wait64(&ts->ts_waitq, LCK_MTX_EVENT(lck),
572 	    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
573 
574 	lck_mtx_ilk_unlock(lck);
575 
576 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
577 
578 	thread_block(THREAD_CONTINUE_NULL);
579 
580 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
581 
582 	LCK_MTX_BLOCK_END(lck, lck->lck_mtx_grp, sleep_start);
583 
584 	return ts;
585 }
586 
587 static void
588 lck_mtx_lock_wait_done(lck_mtx_t *lck, struct turnstile  *ts)
589 {
590 	if (turnstile_complete_compact_id((uintptr_t)lck, ts,
591 	    TURNSTILE_KERNEL_MUTEX)) {
592 		lck->lck_mtx_tsid = 0;
593 	}
594 }
595 
596 /*
597  * Routine:     lck_mtx_lock_will_need_wakeup
598  *
599  * Returns whether the thread is the current turnstile inheritor,
600  * which means it will have to call lck_mtx_unlock_wakeup()
601  * on unlock.
602  */
603 __attribute__((always_inline))
604 static bool
605 lck_mtx_lock_will_need_wakeup(lck_mtx_t *lck, thread_t  self)
606 {
607 	uint32_t tsid = lck->lck_mtx_tsid;
608 
609 	return tsid && turnstile_get_by_id(tsid)->ts_inheritor == self;
610 }
611 
612 /*
613  * Routine:     lck_mtx_unlock_wakeup
614  *
615  * Invoked on unlock when there is contention.
616  *
617  * Called with the interlock locked.
618  *
619  * NOTE: callers should call turnstile_cleanup() after
620  * dropping the interlock.
621  */
622 static void
623 lck_mtx_unlock_wakeup(
624 	lck_mtx_t                       *lck,
625 	__kdebug_only thread_t          thread)
626 {
627 	struct turnstile *ts;
628 	kern_return_t did_wake;
629 
630 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START,
631 	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(thread), 0, 0, 0);
632 
633 	ts = turnstile_get_by_id(lck->lck_mtx_tsid);
634 
635 	/*
636 	 * We can skip turnstile_{prepare,cleanup} because
637 	 * we hold the interlock of the primitive,
638 	 * and enqueues/wakeups all happen under the interlock,
639 	 * which means the turnstile is stable.
640 	 */
641 	did_wake = waitq_wakeup64_one(&ts->ts_waitq, LCK_MTX_EVENT(lck),
642 	    THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
643 	assert(did_wake == KERN_SUCCESS);
644 
645 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
646 
647 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
648 }
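
/*
 * Hedged summary of the contention handshake between this section and
 * lck_mtx_lock_contended()/lck_mtx_unlock_contended() below:
 *
 *	waiter, with the interlock held:
 *		data |= LCK_MTX_ILOCK | LCK_MTX_NEEDS_WAKEUP;
 *		ts = lck_mtx_lock_wait(lock, self, owner, ts);   // blocks
 *
 *	owner, unlocking with the interlock held:
 *		if (data & LCK_MTX_NEEDS_WAKEUP)
 *			lck_mtx_unlock_wakeup(lock, self);
 *		lck_mtx_ilk_unlock_v(lock, data & LCK_MTX_PROFILE);
 *		turnstile_cleanup();     // only after dropping the interlock
 */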
649 
650 
651 #pragma mark lck_mtx_t: lck_mtx_lock
652 
653 static inline bool
654 lck_mtx_ctid_on_core(uint32_t ctid)
655 {
656 	thread_t th = ctid_get_thread_unsafe(ctid);
657 
658 	return th && machine_thread_on_core_allow_invalid(th);
659 }
660 
661 #define LCK_MTX_OWNER_FOR_TRACE(lock) \
662 	VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe((lock)->lck_mtx.data))
663 
664 static void
665 lck_mtx_lock_adaptive_spin(lck_mtx_t *lock, lck_mtx_state_t state)
666 {
667 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
668 	hw_spin_policy_t  pol = &lck_mtx_ilk_timeout_policy;
669 	hw_spin_timeout_t to  = hw_spin_compute_timeout(pol);
670 	hw_spin_state_t   ss  = { };
671 	uint64_t          deadline;
672 
673 	lck_mtx_mcs_t     mcs, node;
674 	lck_mcs_id_t      idx, pidx, clear_idx;
675 	unsigned long     prev;
676 	lck_mtx_state_t   nstate;
677 	ast_t      *const astp = ast_pending();
678 
679 	idx  = lck_mtx_get_mcs_id();
680 	mcs  = &lck_mcs_get_current()->mcs_mtx;
681 
682 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
683 	    trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
684 
685 	/*
686 	 *	Take a spot in the adaptive spin queue,
687 	 *	and then spin until we're at the head of it.
688 	 *
689 	 *	Until we're at the head, we do not need to monitor
690 	 *	for whether the current owner is on core or not:
691 	 *
692 	 *	1. the head of the queue is doing it already,
693 	 *
694 	 *	2. when the entire adaptive spin queue will "give up"
695 	 *	   as a result of the owner going off core, we want
696 	 *	   to avoid a thundering herd and let the AS queue
697  *	   pour into the interlock queue slowly, one thread at a time.
698 	 *
699  *	Do give up if the scheduler signals that something
700 	 *	more important has shown up.
701 	 *
702 	 *	Note: this function is optimized so that we do not touch
703 	 *	      our local mcs node when we're the head of the queue.
704 	 *
705  *	      In the common case where the contention is
706  *	      between 2 cores only, this avoids touching this
707  *	      cacheline at all.
708 	 */
709 	pidx = os_atomic_xchg(&lock->lck_mtx.as_tail, idx, release);
710 	if (pidx) {
711 		node = lck_mtx_get_mcs(pidx);
712 		mcs->lmm_as_prev = pidx;
713 		os_atomic_store(&node->lmm_as_next, mcs, release);
714 
715 		while (!hw_spin_wait_until(&mcs->lmm_as_prev, prev,
716 		    prev == 0 || (os_atomic_load(astp, relaxed) & AST_URGENT))) {
717 			hw_spin_should_keep_spinning(lock, pol, to, &ss);
718 		}
719 
720 		if (__improbable(prev)) {
721 			goto adaptive_spin_fail;
722 		}
723 
724 		clear_idx = 0;
725 	} else {
726 		clear_idx = idx;
727 	}
728 
729 	/*
730 	 *	We're now first in line.
731 	 *
732 	 *	It's our responsibility to monitor the lock's state
733 	 *	for whether (1) the lock has become available,
734 	 *	(2) its owner has gone off core, (3) the scheduler
735 	 *	wants its CPU back, or (4) we've spun for too long.
736 	 */
737 	deadline = ml_get_timebase() + os_atomic_load(&MutexSpin, relaxed);
738 
739 	for (;;) {
740 		state.val = lock_load_exclusive(&lock->lck_mtx.val, acquire);
741 
742 		if (__probable(!state.ilocked && !state.ilk_tail && !state.owner)) {
743 			/*
744 			 * 2-core contention: if we can, try to dequeue
745 			 * ourselves from the adaptive spin queue
746 			 * as part of this CAS in order to avoid
747 			 * the cost of lck_mtx_ilk_lock_cleanup_as_mcs()
748 			 * and zeroing the mcs node at all.
749 			 *
750 			 * Because the queue is designed to limit contention,
751 			 * using store-exclusive over an armv8.1 LSE atomic
752 			 * is actually marginally better (presumably due to
753 			 * the better codegen).
754 			 */
755 			nstate = state;
756 			nstate.ilocked = true;
757 			if (state.as_tail == clear_idx) {
758 				nstate.as_tail = 0;
759 			}
760 			if (__probable(lock_store_exclusive(&lock->lck_mtx.val,
761 			    state.val, nstate.val, acquire))) {
762 				break;
763 			}
764 		} else {
765 			lock_wait_for_event();
766 		}
767 
768 		if (__improbable(ml_get_timebase() > deadline ||
769 		    (os_atomic_load(astp, relaxed) & AST_URGENT) ||
770 		    (!state.ilocked && !state.ilk_tail && state.owner &&
771 		    !lck_mtx_ctid_on_core(state.owner)))) {
772 			goto adaptive_spin_fail;
773 		}
774 	}
775 
776 	/*
777 	 *	If we're here, we got the lock, we just have to cleanup
778 	 *	the MCS nodes and return.
779 	 */
780 	if (state.as_tail != clear_idx) {
781 		lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
782 		lck_mtx_mcs_clear(mcs);
783 	}
784 
785 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
786 	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(thread),
787 	    lock->lck_mtx_tsid, 0, 0);
788 	return;
789 
790 adaptive_spin_fail:
791 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
792 	    trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
793 	return lck_mtx_ilk_lock_contended(lock, state, LCK_ILK_MODE_FROM_AS);
794 }
795 
796 static NOINLINE void
797 lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, lck_mtx_mode_t mode)
798 {
799 	struct turnstile *ts = TURNSTILE_NULL;
800 	lck_mtx_state_t   state;
801 	uint32_t          ctid = thread->ctid;
802 	uint32_t          data;
803 #if CONFIG_DTRACE
804 	int               first_miss = 0;
805 #endif /* CONFIG_DTRACE */
806 	bool              direct_wait = false;
807 	uint64_t          spin_start;
808 	uint32_t          profile;
809 
810 	lck_mtx_check_irq(lock);
811 	if (mode == LCK_MTX_MODE_SLEEPABLE) {
812 		lock_disable_preemption_for_thread(thread);
813 	}
814 
815 	for (;;) {
816 		/*
817 		 *	Load the current state and perform sanity checks
818 		 *
819 		 *	Note that the various "corrupt" values are designed
820 		 *	so that the slowpath is taken when a mutex was used
821 		 *	after destruction, so that we do not have to do
822 		 *	sanity checks in the fast path.
823 		 */
824 		state = os_atomic_load(&lock->lck_mtx, relaxed);
825 		if (state.owner == ctid) {
826 			__lck_mtx_owned_panic(lock, thread);
827 		}
828 		if (lock->lck_mtx_type != LCK_TYPE_MUTEX ||
829 		    state.data == LCK_MTX_TAG_DESTROYED) {
830 			__lck_mtx_invalid_panic(lock);
831 		}
832 		profile = (state.data & LCK_MTX_PROFILE);
833 
834 		/*
835 		 *	Attempt steal
836 		 *
837 		 *	When the lock state is 0, then no thread can be queued
838 		 *	for adaptive spinning or for the interlock yet.
839 		 *
840 		 *	As such we can attempt to take the interlock.
841 		 *	(we can't take the mutex directly because we need
842 		 *	the interlock to do turnstile operations on the way out).
843 		 */
844 		if ((state.val & ~(uint64_t)LCK_MTX_PROFILE) == 0) {
845 			if (!os_atomic_cmpxchgv(&lock->lck_mtx.val,
846 			    state.val, state.val | LCK_MTX_ILOCK,
847 			    &state.val, acquire)) {
848 				continue;
849 			}
850 			break;
851 		}
852 
853 #if CONFIG_DTRACE
854 		if (profile) {
855 			LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
856 		}
857 #endif /* CONFIG_DTRACE */
858 
859 		if (mode == LCK_MTX_MODE_SLEEPABLE) {
860 			spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
861 		} else {
862 			spin_start = LCK_MTX_SPIN_SPIN_BEGIN();
863 		}
864 
865 		/*
866 		 *	Adaptive spin or interlock
867 		 *
868 		 *	Evaluate if adaptive spinning should be attempted,
869 		 *	and if yes go to adaptive spin.
870 		 *
871 		 *	Otherwise (and this includes always-spin mutexes),
872 		 *	go for the interlock.
873 		 */
874 		if (mode != LCK_MTX_MODE_SPIN_ALWAYS &&
875 		    (state.ilocked || state.as_tail || !state.owner ||
876 		    lck_mtx_ctid_on_core(state.owner))) {
877 			lck_mtx_lock_adaptive_spin(lock, state);
878 		} else {
879 			direct_wait = true;
880 			lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_DIRECT);
881 		}
882 
883 		if (mode == LCK_MTX_MODE_SLEEPABLE) {
884 			LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
885 		} else {
886 			LCK_MTX_SPIN_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
887 		}
888 
889 		/*
890 		 *	Take or sleep
891 		 *
892 		 *	We now have the interlock. Either the owner
893 		 *	isn't set, and the mutex is ours to claim,
894 		 *	or we must go to sleep.
895 		 *
896 		 *	If we go to sleep, we need to set LCK_MTX_NEEDS_WAKEUP
897 		 *	to force the current lock owner to call
898 		 *	lck_mtx_unlock_wakeup().
899 		 */
900 		state = os_atomic_load(&lock->lck_mtx, relaxed);
901 		if (state.owner == LCK_MTX_NULL_CTID) {
902 			break;
903 		}
904 
905 		if (mode == LCK_MTX_MODE_SPIN_ALWAYS) {
906 			__lck_mtx_lock_is_sleepable_panic(lock);
907 		}
908 
909 #if CONFIG_DTRACE
910 		if (profile) {
911 			LCK_MTX_PROF_WAIT(lock, lock->lck_mtx_grp,
912 			    direct_wait, &first_miss);
913 		}
914 #endif /* CONFIG_DTRACE */
915 		os_atomic_store(&lock->lck_mtx.data,
916 		    state.data | LCK_MTX_ILOCK | LCK_MTX_NEEDS_WAKEUP,
917 		    compiler_acq_rel);
918 		ts = lck_mtx_lock_wait(lock, thread,
919 		    ctid_get_thread(state.owner), ts);
920 
921 		/* returns interlock unlocked and preemption re-enabled */
922 		lock_disable_preemption_for_thread(thread);
923 	}
924 
925 	/*
926 	 *	We can take the lock!
927 	 *
928 	 *	We only have the interlock and the owner field is 0.
929 	 *
930 	 *	Perform various turnstile cleanups if needed,
931 	 *	claim the lock, and reenable preemption (if needed).
932 	 */
933 	if (ts) {
934 		lck_mtx_lock_wait_done(lock, ts);
935 	}
936 	data = ctid | profile;
937 	if (lck_mtx_lock_will_need_wakeup(lock, thread)) {
938 		data |= LCK_MTX_NEEDS_WAKEUP;
939 	}
940 	if (mode != LCK_MTX_MODE_SLEEPABLE) {
941 		data |= LCK_MTX_ILOCK | LCK_MTX_SPIN_MODE;
942 	}
943 	os_atomic_store(&lock->lck_mtx.data, data, release);
944 
945 	if (mode == LCK_MTX_MODE_SLEEPABLE) {
946 		lock_enable_preemption();
947 	}
948 
949 	assert(thread->turnstile != NULL);
950 
951 	if (ts) {
952 		turnstile_cleanup();
953 	}
954 	LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
955 	    mode != LCK_MTX_MODE_SLEEPABLE, profile);
956 }
957 
958 #if LCK_MTX_CHECK_INVARIANTS || CONFIG_DTRACE
959 __attribute__((noinline))
960 #else
961 __attribute__((always_inline))
962 #endif
963 static void
964 lck_mtx_lock_slow(
965 	lck_mtx_t              *lock,
966 	thread_t                thread,
967 	lck_mtx_state_t         state,
968 	lck_mtx_mode_t          mode)
969 {
970 #pragma unused(state)
971 #if CONFIG_DTRACE
972 	lck_mtx_state_t ostate = {
973 		.data = LCK_MTX_PROFILE,
974 	};
975 #endif /* CONFIG_DTRACE */
976 
977 #if LCK_MTX_CHECK_INVARIANTS
978 	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
979 		lck_mtx_check_preemption(lock, thread,
980 		    (mode == LCK_MTX_MODE_SPIN));
981 	}
982 #endif /* LCK_MTX_CHECK_INVARIANTS */
983 #if CONFIG_DTRACE
984 	if (state.val == ostate.val) {
985 		state.data = thread->ctid | LCK_MTX_PROFILE;
986 		if (mode != LCK_MTX_MODE_SLEEPABLE) {
987 			state.ilocked = true;
988 			state.spin_mode = true;
989 		}
990 		os_atomic_cmpxchgv(&lock->lck_mtx.val,
991 		    ostate.val, state.val, &state.val, acquire);
992 	}
993 	if ((state.val & ~ostate.val) == 0) {
994 		LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
995 		    mode != LCK_MTX_MODE_SLEEPABLE,
996 		    state.data & LCK_MTX_PROFILE);
997 		return;
998 	}
999 #endif /* CONFIG_DTRACE */
1000 	lck_mtx_lock_contended(lock, thread, mode);
1001 }
1002 
1003 static __attribute__((always_inline)) void
1004 lck_mtx_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
1005 {
1006 	thread_t thread = current_thread();
1007 	lck_mtx_state_t state = {
1008 		.data = thread->ctid,
1009 	};
1010 	uint64_t take_slowpath = 0;
1011 
1012 	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
1013 		take_slowpath |= LCK_MTX_SNIFF_PREEMPTION(thread);
1014 	}
1015 	take_slowpath |= LCK_MTX_SNIFF_DTRACE();
1016 
1017 	if (mode != LCK_MTX_MODE_SLEEPABLE) {
1018 		lock_disable_preemption_for_thread(thread);
1019 		state.ilocked = true;
1020 		state.spin_mode = true;
1021 	}
1022 
1023 	/*
1024 	 * Do the CAS on the entire mutex state,
1025 	 * which hence requires the ILK/AS queues
1026 	 * to be empty (which is fairer).
1027 	 */
1028 	lock_cmpxchgv(&lock->lck_mtx.val,
1029 	    0, state.val, &state.val, acquire);
1030 
1031 	take_slowpath |= state.val;
1032 	if (__improbable(take_slowpath)) {
1033 		return lck_mtx_lock_slow(lock, thread, state, mode);
1034 	}
1035 }
1036 
1037 void
1038 lck_mtx_lock(lck_mtx_t *lock)
1039 {
1040 	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
1041 }
1042 
1043 void
1044 lck_mtx_lock_spin(lck_mtx_t *lock)
1045 {
1046 	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
1047 }
1048 
1049 void
1050 lck_mtx_lock_spin_always(lck_mtx_t *lock)
1051 {
1052 	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
1053 }
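
/*
 * Hedged usage sketch: a caller that must not block right away can take the
 * mutex in spin mode and convert it once blocking becomes acceptable (see
 * lck_mtx_convert_spin() below):
 *
 *	lck_mtx_lock_spin(lock);        // interlock held, spin_mode set
 *	...short, non-blocking work...
 *	lck_mtx_convert_spin(lock);     // now held as a full mutex
 *	lck_mtx_unlock(lock);
 */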
1054 
1055 
1056 #pragma mark lck_mtx_t: lck_mtx_try_lock
1057 
1058 static __attribute__((always_inline)) bool
1059 lck_mtx_try_lock_slow_inline(
1060 	lck_mtx_t              *lock,
1061 	thread_t                thread,
1062 	uint32_t                odata,
1063 	uint32_t                ndata,
1064 	bool                    spin)
1065 {
1066 #pragma unused(lock, thread, odata, ndata)
1067 #if CONFIG_DTRACE
1068 	if (odata == LCK_MTX_PROFILE) {
1069 		os_atomic_cmpxchgv(&lock->lck_mtx.data,
1070 		    odata, ndata | LCK_MTX_PROFILE, &odata, acquire);
1071 	}
1072 	if ((odata & ~LCK_MTX_PROFILE) == 0) {
1073 		LCK_MTX_TRY_ACQUIRED(lock, lock->lck_mtx_grp,
1074 		    spin, odata & LCK_MTX_PROFILE);
1075 		return true;
1076 	}
1077 	if (odata & LCK_MTX_PROFILE) {
1078 		LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &(int){ 0 });
1079 	}
1080 #endif /* CONFIG_DTRACE */
1081 
1082 	if (spin) {
1083 		lock_enable_preemption();
1084 	}
1085 	return false;
1086 }
1087 
1088 #if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
1089 __attribute__((noinline))
1090 #else
1091 __attribute__((always_inline))
1092 #endif
1093 static bool
1094 lck_mtx_try_lock_slow(
1095 	lck_mtx_t              *lock,
1096 	thread_t                thread,
1097 	uint32_t                odata,
1098 	uint32_t                ndata)
1099 {
1100 	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, false);
1101 }
1102 
1103 #if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
1104 __attribute__((noinline))
1105 #else
1106 __attribute__((always_inline))
1107 #endif
1108 static bool
1109 lck_mtx_try_lock_slow_spin(
1110 	lck_mtx_t              *lock,
1111 	thread_t                thread,
1112 	uint32_t                odata,
1113 	uint32_t                ndata)
1114 {
1115 	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, true);
1116 }
1117 
1118 static __attribute__((always_inline)) bool
1119 lck_mtx_try_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
1120 {
1121 	thread_t thread = current_thread();
1122 	uint32_t odata, ndata = thread->ctid;
1123 	uint32_t take_slowpath = 0;
1124 
1125 #if CONFIG_DTRACE
1126 	take_slowpath |= lck_debug_state.lds_value;
1127 #endif
1128 	if (mode != LCK_MTX_MODE_SLEEPABLE) {
1129 		lock_disable_preemption_for_thread(thread);
1130 		ndata |= LCK_MTX_SPIN_MODE | LCK_MTX_ILOCK;
1131 	}
1132 
1133 	/*
1134 	 * Because try_lock is likely to be used for cases such as
1135 	 * lock inversion resolution, it tries a bit harder than
1136 	 * lck_mtx_lock() to take the lock, and it ignores the
1137 	 * adaptive spin / interlock queues by doing the CAS
1138 	 * on the 32-bit mutex data only.
1139 	 */
1140 	lock_cmpxchgv(&lock->lck_mtx.data, 0, ndata, &odata, acquire);
1141 
1142 	take_slowpath |= odata;
1143 	if (__probable(!take_slowpath)) {
1144 		return true;
1145 	}
1146 
1147 	if (mode == LCK_MTX_MODE_SPIN_ALWAYS &&
1148 	    (odata & LCK_MTX_CTID_MASK) &&
1149 	    !(odata & LCK_MTX_SPIN_MODE)) {
1150 		__lck_mtx_lock_is_sleepable_panic(lock);
1151 	}
1152 
1153 	if (mode == LCK_MTX_MODE_SLEEPABLE) {
1154 		return lck_mtx_try_lock_slow(lock, thread, odata, ndata);
1155 	} else {
1156 		return lck_mtx_try_lock_slow_spin(lock, thread, odata, ndata);
1157 	}
1158 }
1159 
1160 boolean_t
1161 lck_mtx_try_lock(lck_mtx_t *lock)
1162 {
1163 	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
1164 }
1165 
1166 boolean_t
1167 lck_mtx_try_lock_spin(lck_mtx_t *lock)
1168 {
1169 	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
1170 }
1171 
1172 boolean_t
1173 lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
1174 {
1175 	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
1176 }
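
/*
 * Hedged usage sketch of the lock-inversion pattern mentioned above
 * ("a" and "b" are hypothetical mutexes, not part of this file):
 *
 *	lck_mtx_lock(a);
 *	if (!lck_mtx_try_lock(b)) {
 *		lck_mtx_unlock(a);      // back off, retake in the safe order
 *		lck_mtx_lock(b);
 *		lck_mtx_lock(a);
 *	}
 */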
1177 
1178 
1179 #pragma mark lck_mtx_t: lck_mtx_unlock
1180 
1181 static NOINLINE void
1182 lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, uint32_t data)
1183 {
1184 	bool cleanup = false;
1185 
1186 #if !CONFIG_DTRACE
1187 	/*
1188 	 * This check is done by lck_mtx_unlock_slow() when it is enabled.
1189 	 */
1190 	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
1191 		__lck_mtx_not_owned_panic(lock, thread);
1192 	}
1193 #endif /* !CONFIG_DTRACE */
1194 
1195 	if ((data & LCK_MTX_SPIN_MODE) == 0) {
1196 		lock_disable_preemption_for_thread(thread);
1197 		lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_UNLOCK);
1198 	}
1199 
1200 	/*
1201 	 * We must re-load the data: we might have taken
1202 	 * the slowpath because another thread had taken
1203 	 * the interlock and set the NEEDS_WAKEUP bit
1204 	 * while we were spinning to get it.
1205 	 */
1206 	data = os_atomic_load(&lock->lck_mtx.data, compiler_acq_rel);
1207 	if (data & LCK_MTX_NEEDS_WAKEUP) {
1208 		lck_mtx_unlock_wakeup(lock, thread);
1209 		cleanup = true;
1210 	}
1211 	lck_mtx_ilk_unlock_v(lock, data & LCK_MTX_PROFILE);
1212 
1213 	LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, data & LCK_MTX_PROFILE);
1214 
1215 	/*
1216 	 * Do not do any turnstile operations outside of this block.
1217 	 *
1218 	 * lock/unlock is called at an early stage of boot while single
1219 	 * threaded, without turnstiles being available yet.
1220 	 * Even without contention we can come through the slow path
1221 	 * if the mutex is acquired as a spin lock.
1222 	 */
1223 	if (cleanup) {
1224 		turnstile_cleanup();
1225 	}
1226 }
1227 
1228 #if CONFIG_DTRACE
1229 __attribute__((noinline))
1230 #else
1231 __attribute__((always_inline))
1232 #endif
1233 static void
1234 lck_mtx_unlock_slow(lck_mtx_t *lock, thread_t thread, uint32_t data)
1235 {
1236 #if CONFIG_DTRACE
1237 	/*
1238 	 *	If DTrace is enabled, locks can be profiled,
1239 	 *	which causes the fastpath of unlock to fail.
1240 	 */
1241 	if ((data & LCK_MTX_BITS_MASK) == LCK_MTX_PROFILE) {
1242 		os_atomic_cmpxchgv(&lock->lck_mtx.data, data, LCK_MTX_PROFILE,
1243 		    &data, release);
1244 	}
1245 	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
1246 		__lck_mtx_not_owned_panic(lock, thread);
1247 	}
1248 	if ((data & (LCK_MTX_BITS_MASK & ~LCK_MTX_PROFILE)) == 0) {
1249 		LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, false);
1250 		return;
1251 	}
1252 #endif /* CONFIG_DTRACE */
1253 
1254 	lck_mtx_unlock_contended(lock, thread, data);
1255 }
1256 
1257 void
1258 lck_mtx_unlock(lck_mtx_t *lock)
1259 {
1260 	thread_t thread = current_thread();
1261 	uint32_t take_slowpath = 0;
1262 	uint32_t data;
1263 
1264 	take_slowpath |= LCK_MTX_SNIFF_DTRACE();
1265 
1266 	/*
1267 	 * The fast path ignores the ILK/AS queues on purpose;
1268 	 * those really are a "lock" concept, not an unlock one.
1269 	 */
1270 	if (__probable(lock_cmpxchgv(&lock->lck_mtx.data,
1271 	    thread->ctid, 0, &data, release))) {
1272 		if (__probable(!take_slowpath)) {
1273 			return;
1274 		}
1275 	}
1276 
1277 	lck_mtx_unlock_slow(lock, thread, data);
1278 }
1279 
1280 
1281 #pragma mark lck_mtx_t: misc
1282 
1283 void
1284 lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
1285 {
1286 	lck_mtx_state_t state  = os_atomic_load(&lock->lck_mtx, relaxed);
1287 	thread_t        thread = current_thread();
1288 
1289 	if (type == LCK_MTX_ASSERT_OWNED) {
1290 		if (state.owner != thread->ctid) {
1291 			__lck_mtx_not_owned_panic(lock, thread);
1292 		}
1293 	} else if (type == LCK_MTX_ASSERT_NOTOWNED) {
1294 		if (state.owner == thread->ctid) {
1295 			__lck_mtx_owned_panic(lock, thread);
1296 		}
1297 	} else {
1298 		panic("lck_mtx_assert(): invalid arg (%u)", type);
1299 	}
1300 }
1301 
1302 /*
1303  *	Routine:	lck_mtx_convert_spin
1304  *
1305  *	Convert a mutex held in spin mode into a held full mutex.
1306  */
1307 void
1308 lck_mtx_convert_spin(lck_mtx_t *lock)
1309 {
1310 	lck_mtx_state_t state  = os_atomic_load(&lock->lck_mtx, relaxed);
1311 	thread_t        thread = current_thread();
1312 	uint32_t        data   = thread->ctid;
1313 
1314 	if (state.owner != data) {
1315 		__lck_mtx_not_owned_panic(lock, thread);
1316 	}
1317 
1318 	if (state.spin_mode) {
1319 		/*
1320 		 * Note: we can acquire the lock in spin mode
1321 		 *       _and_ be the inheritor if we waited.
1322 		 *
1323 		 *       We must only clear ilocked and spin_mode,
1324 		 *       but preserve owner and needs_wakeup.
1325 		 */
1326 		state.ilocked = false;
1327 		state.spin_mode = false;
1328 		lck_mtx_ilk_unlock_v(lock, state.data);
1329 		turnstile_cleanup();
1330 	}
1331 }
1332 
1333 /*
1334  * Routine: kdp_lck_mtx_lock_spin_is_acquired
1335  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1336  */
1337 boolean_t
1338 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
1339 {
1340 	lck_mtx_state_t state = os_atomic_load(&lck->lck_mtx, relaxed);
1341 
1342 	if (not_in_kdp) {
1343 		panic("panic: spinlock acquired check done outside of kernel debugger");
1344 	}
1345 	if (state.data == LCK_MTX_TAG_DESTROYED) {
1346 		return false;
1347 	}
1348 	return state.owner || state.ilocked;
1349 }
1350 
1351 void
1352 kdp_lck_mtx_find_owner(
1353 	struct waitq           *waitq __unused,
1354 	event64_t               event,
1355 	thread_waitinfo_t      *waitinfo)
1356 {
1357 	lck_mtx_t      *mutex  = LCK_EVENT_TO_MUTEX(event);
1358 	lck_mtx_state_t state  = os_atomic_load(&mutex->lck_mtx, relaxed);
1359 
1360 	assert3u(state.data, !=, LCK_MTX_TAG_DESTROYED);
1361 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1362 	waitinfo->owner   = thread_tid(ctid_get_thread(state.owner));
1363 }
1364 
1365 #endif /* !LCK_MTX_USE_ARCH */
1366 
1367 /*
1368  * Routine:     mutex_pause
1369  *
1370  * Called by former callers of simple_lock_pause().
1371  */
1372 #define MAX_COLLISION_COUNTS    32
1373 #define MAX_COLLISION   8
1374 
1375 unsigned int max_collision_count[MAX_COLLISION_COUNTS];
1376 
1377 uint32_t collision_backoffs[MAX_COLLISION] = {
1378 	10, 50, 100, 200, 400, 600, 800, 1000
1379 };
1380 
1381 
1382 void
1383 mutex_pause(uint32_t collisions)
1384 {
1385 	wait_result_t wait_result;
1386 	uint32_t        back_off;
1387 
1388 	if (collisions >= MAX_COLLISION_COUNTS) {
1389 		collisions = MAX_COLLISION_COUNTS - 1;
1390 	}
1391 	max_collision_count[collisions]++;
1392 
1393 	if (collisions >= MAX_COLLISION) {
1394 		collisions = MAX_COLLISION - 1;
1395 	}
1396 	back_off = collision_backoffs[collisions];
1397 
1398 	wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, back_off, NSEC_PER_USEC);
1399 	assert(wait_result == THREAD_WAITING);
1400 
1401 	wait_result = thread_block(THREAD_CONTINUE_NULL);
1402 	assert(wait_result == THREAD_TIMED_OUT);
1403 }
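
/*
 * For example (per the collision_backoffs table above), mutex_pause(0)
 * blocks for roughly 10us, and mutex_pause(7) or above for roughly 1000us,
 * since the index is clamped to MAX_COLLISION - 1.
 */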
1404 
1405 
1406 unsigned int mutex_yield_wait = 0;
1407 unsigned int mutex_yield_no_wait = 0;
1408 
1409 boolean_t
1410 lck_mtx_yield(
1411 	lck_mtx_t   *lck)
1412 {
1413 	bool has_waiters = LCK_MTX_HAS_WAITERS(lck);
1414 
1415 #if DEBUG
1416 	lck_mtx_assert(lck, LCK_MTX_ASSERT_OWNED);
1417 #endif /* DEBUG */
1418 
1419 	if (!has_waiters) {
1420 		mutex_yield_no_wait++;
1421 	} else {
1422 		mutex_yield_wait++;
1423 		lck_mtx_unlock(lck);
1424 		mutex_pause(0);
1425 		lck_mtx_lock(lck);
1426 	}
1427 	return has_waiters;
1428 }
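
/*
 * Hedged usage sketch: a long-running loop that holds the mutex can yield it
 * periodically so that queued waiters make progress ("list_lock" and the
 * loop are hypothetical):
 *
 *	lck_mtx_lock(list_lock);
 *	for (...) {
 *		...one unit of work...
 *		lck_mtx_yield(list_lock);   // drops/retakes only if there are waiters
 *	}
 *	lck_mtx_unlock(list_lock);
 */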
1429