1 /*
2 * Copyright (c) 2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #define LOCK_PRIVATE 1
30
31 #include <mach_ldebug.h>
32 #include <kern/locks_internal.h>
33 #include <kern/lock_stat.h>
34 #include <kern/locks.h>
35 #include <kern/kalloc.h>
36 #include <kern/thread.h>
37
38 #include <mach/machine/sdt.h>
39
40 #include <machine/cpu_data.h>
41 #include <machine/machine_cpu.h>
42
43 #if !LCK_MTX_USE_ARCH
44
45 /*
46 * lck_mtx_t
47 * ~~~~~~~~~
48 *
49 * Kernel mutexes in this implementation are made of four 32 bits words:
50 *
51 * - word 0: turnstile compact ID (24 bits) and the 0x22 lock tag
52 * - word 1: padding (to be used for group compact IDs)
 * - word 2: mutex state (lock owner + interlock, spin and waiters bits),
 *           referred to as "data" in the code.
 * - word 3: adaptive spin and interlock MCS queue tails.
 *
 * The 64 bits word made of the last two words is referred to
 * as the "mutex state" in code.
59 *
60 *
61 * Core serialization rules
62 * ~~~~~~~~~~~~~~~~~~~~~~~~
63 *
64 * The mutex has a bit (lck_mtx_t::lck_mtx.ilocked or bit LCK_MTX_ILOCK
65 * of the data word) that serves as a spinlock for the mutex state.
66 *
67 *
68 * Updating the lock fields must follow the following rules:
69 *
70 * - It is ok to "steal" the mutex (updating its data field) if no one
71 * holds the interlock.
72 *
73 * - Holding the interlock allows its holder to update the first 3 words
74 * of the kernel mutex without using RMW atomics (plain stores are OK).
75 *
76 * - Holding the interlock is required for a thread to remove itself
77 * from the adaptive spin queue.
78 *
79 * - Threads can enqueue themselves onto the adaptive spin wait queue
80 * or the interlock wait queue at any time.
81 *
82 *
83 * Waiters bit and turnstiles
84 * ~~~~~~~~~~~~~~~~~~~~~~~~~~
85 *
86 * The turnstile on a kernel mutex is set by waiters, and cleared
87 * once they have all been resumed and successfully acquired the lock.
88 *
89 * LCK_MTX_NEEDS_WAKEUP being set (always with an owner set too)
90 * forces threads to the lck_mtx_unlock slowpath,
91 * in order to evaluate whether lck_mtx_unlock_wakeup() must be called.
92 *
93 * As a result it means it really only needs to be set at select times:
94 *
95 * - when a thread blocks and "snitches" on the current thread owner,
96 * so that when that thread unlocks it calls wake up,
97 *
98 * - when a thread that was woken up resumes its work and became
99 * the inheritor.
100 */
101
#define ADAPTIVE_SPIN_ENABLE 0x1

#define NOINLINE __attribute__((noinline))
/* 64-bit wait event for turnstile waits: address of the mutex "data" word */
#define LCK_MTX_EVENT(lck) CAST_EVENT64_T(&(lck)->lck_mtx.data)
/* Inverse of LCK_MTX_EVENT(): recover the mutex from a wait event */
#define LCK_EVENT_TO_MUTEX(e) __container_of((uint32_t *)(e), lck_mtx_t, lck_mtx.data)
/* Whether some thread blocked on the mutex and unlock must call wakeup */
#define LCK_MTX_HAS_WAITERS(l) ((l)->lck_mtx.data & LCK_MTX_NEEDS_WAKEUP)

#if DEVELOPMENT || DEBUG
/* Boot-arg escape hatch for the preemption-level checks in lck_mtx_check_preemption() */
TUNABLE(bool, LckDisablePreemptCheck, "-disable_mtx_chk", false);
#endif /* DEVELOPMENT || DEBUG */

extern unsigned int not_in_kdp;

KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);

/* ctid value meaning "no owner" */
#define LCK_MTX_NULL_CTID 0x00000000u

/* How the caller wants to hold the mutex. */
__enum_decl(lck_mtx_mode_t, uint32_t, {
	LCK_MTX_MODE_SLEEPABLE,         /* regular mutex, may block          */
	LCK_MTX_MODE_SPIN,              /* held in spin mode, may not block  */
	LCK_MTX_MODE_SPIN_ALWAYS,       /* spin-only lock, sleeping is fatal */
});

/* Why the interlock is being taken (see lck_mtx_ilk_lock_contended()). */
__enum_decl(lck_ilk_mode_t, uint32_t, {
	LCK_ILK_MODE_UNLOCK,            /* presumably the unlock slowpath — not visible in this file chunk */
	LCK_ILK_MODE_DIRECT,            /* straight from the lock slowpath   */
	LCK_ILK_MODE_FROM_AS,           /* after an adaptive-spin bail-out   */
});
130
131 static inline void
lck_mtx_mcs_clear(lck_mtx_mcs_t mcs)132 lck_mtx_mcs_clear(lck_mtx_mcs_t mcs)
133 {
134 *mcs = (struct lck_mtx_mcs){ };
135 }
136
137 static inline lck_mcs_id_t
lck_mtx_get_mcs_id(void)138 lck_mtx_get_mcs_id(void)
139 {
140 return lck_mcs_id_current(LCK_MCS_SLOT_0);
141 }
142
/*
 * Resolve the mutex MCS node of another CPU from its MCS id @c idx.
 */
__pure2
static inline lck_mtx_mcs_t
lck_mtx_get_mcs(lck_mcs_id_t idx)
{
	return &lck_mcs_get_other(idx)->mcs_mtx;
}
149
150
151 #pragma mark lck_mtx_t: validation
152
/*
 * Panic helper: the mutex fails its type/state sanity checks
 * (used after destruction or memory corruption).
 */
__abortlike
static void
__lck_mtx_invalid_panic(lck_mtx_t *lck)
{
	panic("Invalid/destroyed mutex %p: "
	    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
	    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
	    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
}
163
/* Panic helper: @c thread was expected to own @c lock but does not. */
__abortlike
static void
__lck_mtx_not_owned_panic(lck_mtx_t *lock, thread_t thread)
{
	panic("Mutex %p is unexpectedly not owned by thread %p", lock, thread);
}
170
/* Panic helper: @c thread already owns @c lock (recursive acquire). */
__abortlike
static void
__lck_mtx_owned_panic(lck_mtx_t *lock, thread_t thread)
{
	panic("Mutex %p is unexpectedly owned by thread %p", lock, thread);
}
177
/* Panic helper: a spin-always acquire found the lock held as a full mutex. */
__abortlike
static void
__lck_mtx_lock_is_sleepable_panic(lck_mtx_t *lck)
{
	// "Always" variants can never block. If the lock is held as a normal mutex
	// then someone is mixing always and non-always calls on the same lock, which is
	// forbidden.
	panic("Mutex %p is held as a full-mutex (spin-always lock attempted)", lck);
}
187
188 #if DEVELOPMENT || DEBUG
/*
 * Panic helper: a mutex acquire ran with preemption disabled beyond
 * the @c expected baseline (the delta is reported in the message).
 */
__abortlike
static void
__lck_mtx_preemption_disabled_panic(lck_mtx_t *lck, int expected)
{
	panic("Attempt to take mutex %p with preemption disabled (%d)",
	    lck, get_preemption_level() - expected);
}
196
/* Panic helper: a mutex acquire was attempted from interrupt context. */
__abortlike
static void
__lck_mtx_at_irq_panic(lck_mtx_t *lck)
{
	panic("Attempt to take mutex %p in IRQ context", lck);
}
203
/*
 * Routine: lck_mtx_check_preemption
 *
 * Verify preemption is enabled when attempting to acquire a mutex.
 *
 * @c expected is the baseline preemption level the caller may have
 * legitimately added (e.g. spin-mode acquires disable preemption once).
 */
static inline void
lck_mtx_check_preemption(lck_mtx_t *lock, thread_t thread, int expected)
{
/*
 * NOTE(review): @c thread is referenced right below, so this pragma looks
 * stale — unless lock_preemption_level_for_thread() can expand without
 * using its argument on some configuration. Confirm before removing.
 */
#pragma unused(thread)
	if (lock_preemption_level_for_thread(thread) == expected) {
		return;
	}
	/* Explicit opt-out via the "-disable_mtx_chk" boot-arg */
	if (LckDisablePreemptCheck) {
		return;
	}
	/* Skip the check while this CPU is in hibernation processing */
	if (current_cpu_datap()->cpu_hibernate) {
		return;
	}
	/* Too early in boot for the invariant to hold */
	if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
		return;
	}
	__lck_mtx_preemption_disabled_panic(lock, expected);
}
227
228 static inline void
lck_mtx_check_irq(lck_mtx_t * lock)229 lck_mtx_check_irq(lck_mtx_t *lock)
230 {
231 if (ml_at_interrupt_context()) {
232 __lck_mtx_at_irq_panic(lock);
233 }
234 }
235
236 #define LCK_MTX_SNIFF_PREEMPTION(thread) lock_preemption_level_for_thread(thread)
237 #define LCK_MTX_CHECK_INVARIANTS 1
238 #else
239 #define lck_mtx_check_irq(lck) ((void)0)
240 #define LCK_MTX_SNIFF_PREEMPTION(thread) 0
241 #define LCK_MTX_CHECK_INVARIANTS 0
242 #endif /* !DEVELOPMENT && !DEBUG */
243
244 #if CONFIG_DTRACE
245 #define LCK_MTX_SNIFF_DTRACE() lck_debug_state.lds_value
246 #else
247 #define LCK_MTX_SNIFF_DTRACE() 0
248 #endif
249
250
251 #pragma mark lck_mtx_t: alloc/init/destroy/free
252
253 lck_mtx_t *
lck_mtx_alloc_init(lck_grp_t * grp,lck_attr_t * attr)254 lck_mtx_alloc_init(lck_grp_t *grp, lck_attr_t *attr)
255 {
256 lck_mtx_t *lck;
257
258 lck = zalloc(KT_LCK_MTX);
259 lck_mtx_init(lck, grp, attr);
260 return lck;
261 }
262
/*
 * Destroy and free a mutex previously made with lck_mtx_alloc_init().
 * lck_mtx_destroy() performs the idle-state sanity checks before
 * the memory is returned to the zone.
 */
void
lck_mtx_free(lck_mtx_t *lck, lck_grp_t *grp)
{
	lck_mtx_destroy(lck, grp);
	zfree(KT_LCK_MTX, lck);
}
269
270 void
lck_mtx_init(lck_mtx_t * lck,lck_grp_t * grp,lck_attr_t * attr)271 lck_mtx_init(lck_mtx_t *lck, lck_grp_t *grp, lck_attr_t *attr)
272 {
273 if (attr == LCK_ATTR_NULL) {
274 attr = &lck_attr_default;
275 }
276
277 *lck = (lck_mtx_t){
278 .lck_mtx_type = LCK_TYPE_MUTEX,
279 .lck_mtx_grp = grp->lck_grp_attr_id,
280 };
281 if (attr->lck_attr_val & LCK_ATTR_DEBUG) {
282 lck->lck_mtx.data |= LCK_MTX_PROFILE;
283 }
284
285 lck_grp_reference(grp, &grp->lck_grp_mtxcnt);
286 }
287
/*
 * Destroy a mutex: verify it is fully idle, then poison its state.
 */
void
lck_mtx_destroy(lck_mtx_t *lck, lck_grp_t *grp)
{
	/* A live turnstile ID means threads are still waiting on this lock */
	if (lck->lck_mtx_tsid && lck->lck_mtx_type == LCK_TYPE_MUTEX) {
		panic("Mutex to destroy still has waiters: %p: "
		    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
		    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
		    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
		    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
	}
	/*
	 * Beyond the optional profiling bit, the lock must be idle:
	 * right type, no owner/interlock/waiters bits, and empty
	 * adaptive-spin and interlock MCS queues.
	 */
	if (lck->lck_mtx_type != LCK_TYPE_MUTEX ||
	    (lck->lck_mtx.data & ~LCK_MTX_PROFILE) ||
	    lck->lck_mtx.as_tail || lck->lck_mtx.ilk_tail) {
		__lck_mtx_invalid_panic(lck);
	}
	LCK_GRP_ASSERT_ID(grp, lck->lck_mtx_grp);
	/* Poison so any later use is caught by the slowpath sanity checks */
	lck->lck_mtx_type = LCK_TYPE_NONE;
	lck->lck_mtx.data = LCK_MTX_TAG_DESTROYED;
	lck->lck_mtx_grp = 0;
	lck_grp_deallocate(grp, &grp->lck_grp_mtxcnt);
}
309
310
311 #pragma mark lck_mtx_t: lck_mtx_ilk*
312
/*
 * hw_spin policy callback: the interlock spin exceeded its timeout;
 * dump the full lock state and panic.
 */
static hw_spin_timeout_status_t
lck_mtx_ilk_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
{
	lck_mtx_t *lck = _lock;

	panic("Mutex interlock[%p] " HW_SPIN_TIMEOUT_FMT "; "
	    "current owner: %p, "
	    "<0x%06x 0x%02x 0x%08x 0x%08x 0x%04x 0x%04x>, "
	    HW_SPIN_TIMEOUT_DETAILS_FMT,
	    lck, HW_SPIN_TIMEOUT_ARG(to, st),
	    ctid_get_thread_unsafe(lck->lck_mtx.owner),
	    lck->lck_mtx_tsid, lck->lck_mtx_type,
	    lck->lck_mtx_grp, lck->lck_mtx.data,
	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail,
	    HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
}
329
/*
 * Spin policy for the mutex interlock: panic via
 * lck_mtx_ilk_timeout_panic() after the system lock_panic_timeout.
 */
static const struct hw_spin_policy lck_mtx_ilk_timeout_policy = {
	.hwsp_name              = "lck_mtx_t (ilk)",
	.hwsp_timeout_atomic    = &lock_panic_timeout,
	.hwsp_op_timeout        = lck_mtx_ilk_timeout_panic,
};
335
/*
 * Routine: lck_mtx_ilk_lock_cleanup_as_mcs
 *
 * Dequeue node @c mcs (MCS id @c idx) from the adaptive spin queue.
 *
 * Called with the interlock held, which is what keeps the links of
 * already-enqueued nodes stable (only new enqueues can race, at the
 * tail). @c to / @c ss carry the caller's spin-timeout accounting.
 */
static void
lck_mtx_ilk_lock_cleanup_as_mcs(
	lck_mtx_t              *lock,
	lck_mcs_id_t            idx,
	lck_mtx_mcs_t           mcs,
	hw_spin_timeout_t       to,
	hw_spin_state_t        *ss)
{
	lck_mtx_mcs_t   nnode = NULL;
	lck_mcs_id_t    pidx  = (lck_mcs_id_t)mcs->lmm_as_prev;
	bool            was_last;

	/*
	 * This is called when the thread made use
	 * of the adaptive spin queue and needs
	 * to remove itself from it.
	 */

	/*
	 * If the thread is last, set the tail to the node before us.
	 */
	was_last = lock_cmpxchg(&lock->lck_mtx.as_tail, idx, pidx, release);

	if (was_last) {
		/*
		 * If @c mcs was last, we need to erase the previous
		 * node link to it.
		 *
		 * However, new nodes could have now taken our place
		 * and set the previous node's @c lmm_as_next field
		 * already, so we must CAS rather than blindly set.
		 *
		 * We know the previous node is stable because
		 * we hold the interlock (preventing concurrent
		 * removals).
		 */
		if (pidx) {
			os_atomic_cmpxchg(&lck_mtx_get_mcs(pidx)->lmm_as_next,
			    mcs, nnode, relaxed);
		}
	} else {
		/*
		 * If @c mcs wasn't last, then wait to make sure
		 * we observe @c lmm_as_next. Once we do, we know
		 * the field is stable since we hold the interlock
		 * (preventing concurrent dequeues).
		 *
		 * We can then update it to @c mcs next node index
		 * (which is also stable for similar reasons).
		 *
		 * Lastly update the previous node @c lmm_as_next
		 * field as well to terminate the dequeue.
		 */
		while (!hw_spin_wait_until(&mcs->lmm_as_next, nnode, nnode)) {
			hw_spin_policy_t pol = &lck_mtx_ilk_timeout_policy;
			hw_spin_should_keep_spinning(lock, pol, to, ss);
		}

		os_atomic_store(&nnode->lmm_as_prev, pidx, relaxed);
		if (pidx) {
			os_atomic_store(&lck_mtx_get_mcs(pidx)->lmm_as_next,
			    nnode, relaxed);
		}
	}

	/*
	 * @c mcs's fields are left dangling,
	 * it is the responsibility of the caller
	 * to terminate the cleanup.
	 */
}
407
408 static NOINLINE void
lck_mtx_ilk_lock_contended(lck_mtx_t * lock,lck_mtx_state_t state,lck_ilk_mode_t mode)409 lck_mtx_ilk_lock_contended(
410 lck_mtx_t *lock,
411 lck_mtx_state_t state,
412 lck_ilk_mode_t mode)
413 {
414 hw_spin_policy_t pol = &lck_mtx_ilk_timeout_policy;
415 hw_spin_timeout_t to = hw_spin_compute_timeout(pol);
416 hw_spin_state_t ss = { };
417
418 lck_mtx_mcs_t mcs, nnode, pnode;
419 lck_mcs_id_t idx, pidx;
420 lck_mtx_state_t nstate;
421 unsigned long ready;
422 uint64_t spin_start;
423
424 /*
425 * Take a spot in the interlock MCS queue,
426 * and then spin until we're at the head of it.
427 */
428
429 idx = lck_mtx_get_mcs_id();
430 mcs = &lck_mcs_get_current()->mcs_mtx;
431 if (mode != LCK_MTX_MODE_SPIN) {
432 spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
433 }
434
435 mcs->lmm_ilk_current = lock;
436 pidx = os_atomic_xchg(&lock->lck_mtx.ilk_tail, idx, release);
437 if (pidx) {
438 pnode = lck_mtx_get_mcs(pidx);
439 os_atomic_store(&pnode->lmm_ilk_next, mcs, relaxed);
440
441 while (!hw_spin_wait_until(&mcs->lmm_ilk_ready, ready, ready)) {
442 hw_spin_should_keep_spinning(lock, pol, to, &ss);
443 }
444 }
445
446
447 /*
448 * We're now the first in line, wait for the interlock
449 * to look ready and take it.
450 *
451 * We can't just assume the lock is ours for the taking,
452 * because the fastpath of lck_mtx_lock_spin{,_always}
453 * only look at the mutex "data" and might steal it.
454 *
455 * Also clear the interlock MCS tail if @c mcs is last.
456 */
457 do {
458 while (!hw_spin_wait_until(&lock->lck_mtx.val,
459 state.val, state.ilocked == 0)) {
460 hw_spin_should_keep_spinning(lock, pol, to, &ss);
461 }
462
463 nstate = state;
464 nstate.ilocked = 1;
465 if (nstate.ilk_tail == idx) {
466 nstate.ilk_tail = 0;
467 }
468 } while (!os_atomic_cmpxchg(&lock->lck_mtx, state, nstate, acquire));
469
470
471 /*
472 * We now have the interlock, let's cleanup the MCS state.
473 *
474 * First, if there is a node after us, notify that it
475 * is at the head of the interlock queue.
476 *
477 * Second, perform the adaptive spin MCS cleanup if needed.
478 *
479 * Lastly, clear the MCS node.
480 */
481 if (state.ilk_tail != idx) {
482 while (!hw_spin_wait_until(&mcs->lmm_ilk_next, nnode, nnode)) {
483 hw_spin_should_keep_spinning(lock, pol, to, &ss);
484 }
485
486 os_atomic_store(&nnode->lmm_ilk_ready, 1, relaxed);
487 }
488
489 if (mode == LCK_ILK_MODE_FROM_AS) {
490 lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
491 }
492 lck_mtx_mcs_clear(mcs);
493
494 if (mode != LCK_MTX_MODE_SPIN) {
495 LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
496 }
497 }
498
/*
 * Take the mutex interlock (preemption already disabled by the caller).
 *
 * Fastpath: set the interlock bit provided it is neither held nor
 * contended (empty MCS tail); otherwise fall into the contended path
 * which queues on the interlock MCS queue.
 */
static void
lck_mtx_ilk_lock_nopreempt(lck_mtx_t *lock, lck_ilk_mode_t mode)
{
	lck_mtx_state_t state, nstate;

	os_atomic_rmw_loop(&lock->lck_mtx.val, state.val, nstate.val, acquire, {
		if (__improbable(state.ilocked || state.ilk_tail)) {
			os_atomic_rmw_loop_give_up({
				return lck_mtx_ilk_lock_contended(lock, state, mode);
			});
		}

		nstate = state;
		nstate.ilocked = true;
	});
}
515
/*
 * Release the interlock, publishing @c data as the new "data" word.
 *
 * The release store pairs with the acquire used to take the interlock;
 * preemption is re-enabled only after the lock word is published.
 */
static void
lck_mtx_ilk_unlock_v(lck_mtx_t *lock, uint32_t data)
{
	os_atomic_store(&lock->lck_mtx.data, data, release);
	lock_enable_preemption();
}
522
/* Release the interlock, keeping all other bits of the data word intact. */
static void
lck_mtx_ilk_unlock(lck_mtx_t *lock)
{
	lck_mtx_ilk_unlock_v(lock, lock->lck_mtx.data & ~LCK_MTX_ILOCK);
}
528
529
530 #pragma mark lck_mtx_t: turnstile integration
531
/*
 * Routine: lck_mtx_lock_wait
 *
 * Invoked in order to wait on contention.
 *
 * Called with the interlock locked and
 * returns it unlocked.
 *
 * Always aggressively sets the owning thread to promoted,
 * even if it's the same or higher priority
 * This prevents it from lowering its own priority while holding a lock
 *
 * TODO: Come up with a more efficient way to handle same-priority promotions
 *      <rdar://problem/30737670> ARM mutex contention logic could avoid taking the thread lock
 *
 * Returns the turnstile (prepared on first call) so the caller can pass
 * it back in on subsequent waits and release it via lck_mtx_lock_wait_done().
 */
static struct turnstile *
lck_mtx_lock_wait(
	lck_mtx_t              *lck,
	thread_t                self,
	thread_t                holder,
	struct turnstile       *ts)
{
	uint64_t sleep_start = LCK_MTX_BLOCK_BEGIN();

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(self), 0, 0, 0);

	if (ts == TURNSTILE_NULL) {
		/*
		 * First wait on this lock by this thread: prepare a turnstile
		 * and publish its compact ID on the lock if it has none yet.
		 */
		ts = turnstile_prepare_compact_id((uintptr_t)lck,
		    lck->lck_mtx_tsid, TURNSTILE_KERNEL_MUTEX);
		if (lck->lck_mtx_tsid == 0) {
			lck->lck_mtx_tsid = ts->ts_compact_id;
		}
	}
	assert3u(ts->ts_compact_id, ==, lck->lck_mtx_tsid);

	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
	/* push our priority on the current owner (the promotion) */
	turnstile_update_inheritor(ts, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));

	waitq_assert_wait64(&ts->ts_waitq, LCK_MTX_EVENT(lck),
	    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);

	/* drop the interlock before blocking, as documented above */
	lck_mtx_ilk_unlock(lck);

	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);

	thread_block(THREAD_CONTINUE_NULL);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);

	LCK_MTX_BLOCK_END(lck, lck->lck_mtx_grp, sleep_start);

	return ts;
}
586
/*
 * Release the turnstile prepared by lck_mtx_lock_wait(); clears the
 * lock's turnstile ID when this was the last reference.
 */
static void
lck_mtx_lock_wait_done(lck_mtx_t *lck, struct turnstile *ts)
{
	if (turnstile_complete_compact_id((uintptr_t)lck, ts,
	    TURNSTILE_KERNEL_MUTEX)) {
		lck->lck_mtx_tsid = 0;
	}
}
595
/*
 * Routine: lck_mtx_lock_will_need_wakeup
 *
 * Returns whether the thread is the current turnstile inheritor,
 * which means it will have to call lck_mtx_unlock_wakeup()
 * on unlock.
 */
__attribute__((always_inline))
static bool
lck_mtx_lock_will_need_wakeup(lck_mtx_t *lck, thread_t self)
{
	uint32_t tsid = lck->lck_mtx_tsid;

	return tsid && turnstile_get_by_id(tsid)->ts_inheritor == self;
}
611
/*
 * Routine: lck_mtx_unlock_wakeup
 *
 * Invoked on unlock when there is contention.
 *
 * Called with the interlock locked.
 *
 * NOTE: callers should call turnstile_cleanup after
 *       dropping the interlock.
 */
static void
lck_mtx_unlock_wakeup(
	lck_mtx_t              *lck,
	__kdebug_only thread_t  thread)
{
	struct turnstile *ts;
	kern_return_t     did_wake;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START,
	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(thread), 0, 0, 0);

	ts = turnstile_get_by_id(lck->lck_mtx_tsid);

	/*
	 * We can skip turnstile_{prepare,cleanup} because
	 * we hold the interlock of the primitive,
	 * and enqueues/wakeups all happen under the interlock,
	 * which means the turnstile is stable.
	 */
	did_wake = waitq_wakeup64_one(&ts->ts_waitq, LCK_MTX_EVENT(lck),
	    THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
	assert(did_wake == KERN_SUCCESS);

	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}
649
650
651 #pragma mark lck_mtx_t: lck_mtx_lock
652
653 static inline bool
lck_mtx_ctid_on_core(uint32_t ctid)654 lck_mtx_ctid_on_core(uint32_t ctid)
655 {
656 thread_t th = ctid_get_thread_unsafe(ctid);
657
658 return th && machine_thread_on_core_allow_invalid(th);
659 }
660
/*
 * Trace helper: resolve the lock owner thread for kdebug events.
 *
 * Use the @c owner bitfield (just the ctid) rather than the raw
 * @c data word: data also carries the interlock/spin/waiters bits
 * (see the layout comment at the top of this file), and feeding
 * those into ctid_get_thread_unsafe() would resolve to a bogus
 * thread whenever any of them is set.
 */
#define LCK_MTX_OWNER_FOR_TRACE(lock) \
	VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe((lock)->lck_mtx.owner))
663
664 static void
lck_mtx_lock_adaptive_spin(lck_mtx_t * lock,lck_mtx_state_t state)665 lck_mtx_lock_adaptive_spin(lck_mtx_t *lock, lck_mtx_state_t state)
666 {
667 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
668 hw_spin_policy_t pol = &lck_mtx_ilk_timeout_policy;
669 hw_spin_timeout_t to = hw_spin_compute_timeout(pol);
670 hw_spin_state_t ss = { };
671 uint64_t deadline;
672
673 lck_mtx_mcs_t mcs, node;
674 lck_mcs_id_t idx, pidx, clear_idx;
675 unsigned long prev;
676 lck_mtx_state_t nstate;
677 ast_t *const astp = ast_pending();
678
679 idx = lck_mtx_get_mcs_id();
680 mcs = &lck_mcs_get_current()->mcs_mtx;
681
682 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
683 trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
684
685 /*
686 * Take a spot in the adaptive spin queue,
687 * and then spin until we're at the head of it.
688 *
689 * Until we're at the head, we do not need to monitor
690 * for whether the current owner is on core or not:
691 *
692 * 1. the head of the queue is doing it already,
693 *
694 * 2. when the entire adaptive spin queue will "give up"
695 * as a result of the owner going off core, we want
696 * to avoid a thundering herd and let the AS queue
697 * pour into the interlock one slowly.
698 *
699 * Do give up if the scheduler made noises something
700 * more important has shown up.
701 *
702 * Note: this function is optimized so that we do not touch
703 * our local mcs node when we're the head of the queue.
704 *
705 * This allows us in the case when the contention is
706 * between 2 cores only to not have to touch this
707 * cacheline at all.
708 */
709 pidx = os_atomic_xchg(&lock->lck_mtx.as_tail, idx, release);
710 if (pidx) {
711 node = lck_mtx_get_mcs(pidx);
712 mcs->lmm_as_prev = pidx;
713 os_atomic_store(&node->lmm_as_next, mcs, release);
714
715 while (!hw_spin_wait_until(&mcs->lmm_as_prev, prev,
716 prev == 0 || (os_atomic_load(astp, relaxed) & AST_URGENT))) {
717 hw_spin_should_keep_spinning(lock, pol, to, &ss);
718 }
719
720 if (__improbable(prev)) {
721 goto adaptive_spin_fail;
722 }
723
724 clear_idx = 0;
725 } else {
726 clear_idx = idx;
727 }
728
729 /*
730 * We're now first in line.
731 *
732 * It's our responsbility to monitor the lock's state
733 * for whether (1) the lock has become available,
734 * (2) its owner has gone off core, (3) the scheduler
735 * wants its CPU back, or (4) we've spun for too long.
736 */
737 deadline = ml_get_timebase() + os_atomic_load(&MutexSpin, relaxed);
738
739 for (;;) {
740 state.val = lock_load_exclusive(&lock->lck_mtx.val, acquire);
741
742 if (__probable(!state.ilocked && !state.ilk_tail && !state.owner)) {
743 /*
744 * 2-core contention: if we can, try to dequeue
745 * ourselves from the adaptive spin queue
746 * as part of this CAS in order to avoid
747 * the cost of lck_mtx_ilk_lock_cleanup_as_mcs()
748 * and zeroing the mcs node at all.
749 *
750 * Because the queue is designed to limit contention,
751 * using store-exclusive over an armv8.1 LSE atomic
752 * is actually marginally better (presumably due to
753 * the better codegen).
754 */
755 nstate = state;
756 nstate.ilocked = true;
757 if (state.as_tail == clear_idx) {
758 nstate.as_tail = 0;
759 }
760 if (__probable(lock_store_exclusive(&lock->lck_mtx.val,
761 state.val, nstate.val, acquire))) {
762 break;
763 }
764 } else {
765 lock_wait_for_event();
766 }
767
768 if (__improbable(ml_get_timebase() > deadline ||
769 (os_atomic_load(astp, relaxed) & AST_URGENT) ||
770 (!state.ilocked && !state.ilk_tail && state.owner &&
771 !lck_mtx_ctid_on_core(state.owner)))) {
772 goto adaptive_spin_fail;
773 }
774 }
775
776 /*
777 * If we're here, we got the lock, we just have to cleanup
778 * the MCS nodes and return.
779 */
780 if (state.as_tail != clear_idx) {
781 lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
782 lck_mtx_mcs_clear(mcs);
783 }
784
785 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
786 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(thread),
787 lock->lck_mtx_tsid, 0, 0);
788 return;
789
790 adaptive_spin_fail:
791 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
792 trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
793 return lck_mtx_ilk_lock_contended(lock, state, LCK_ILK_MODE_FROM_AS);
794 }
795
/*
 * Routine: lck_mtx_lock_contended
 *
 * Full slowpath for acquiring a mutex in any @c mode: steal it if
 * completely idle, otherwise adaptively spin or take the interlock
 * directly, then either claim the lock or sleep on its turnstile.
 */
static NOINLINE void
lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, lck_mtx_mode_t mode)
{
	struct turnstile *ts = TURNSTILE_NULL;
	lck_mtx_state_t   state;
	uint32_t          ctid = thread->ctid;
	uint32_t          data;
#if CONFIG_DTRACE
	int               first_miss = 0;
#endif /* CONFIG_DTRACE */
	bool              direct_wait = false;
	uint64_t          spin_start;
	uint32_t          profile;

	lck_mtx_check_irq(lock);
	if (mode == LCK_MTX_MODE_SLEEPABLE) {
		/* spin modes arrive with preemption already disabled by the fastpath */
		lock_disable_preemption_for_thread(thread);
	}

	for (;;) {
		/*
		 * Load the current state and perform sanity checks
		 *
		 * Note that the various "corrupt" values are designed
		 * so that the slowpath is taken when a mutex was used
		 * after destruction, so that we do not have to do
		 * sanity checks in the fast path.
		 */
		state = os_atomic_load(&lock->lck_mtx, relaxed);
		if (state.owner == ctid) {
			__lck_mtx_owned_panic(lock, thread);
		}
		if (lock->lck_mtx_type != LCK_TYPE_MUTEX ||
		    state.data == LCK_MTX_TAG_DESTROYED) {
			__lck_mtx_invalid_panic(lock);
		}
		profile = (state.data & LCK_MTX_PROFILE);

		/*
		 * Attempt steal
		 *
		 * When the lock state is 0, then no thread can be queued
		 * for adaptive spinning or for the interlock yet.
		 *
		 * As such we can attempt to try to take the interlock.
		 * (we can't take the mutex directly because we need
		 * the interlock to do turnstile operations on the way out).
		 */
		if ((state.val & ~(uint64_t)LCK_MTX_PROFILE) == 0) {
			if (!os_atomic_cmpxchgv(&lock->lck_mtx.val,
			    state.val, state.val | LCK_MTX_ILOCK,
			    &state.val, acquire)) {
				continue;
			}
			break;
		}

#if CONFIG_DTRACE
		if (profile) {
			LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
		}
#endif /* CONFIG_DTRACE */

		if (mode == LCK_MTX_MODE_SLEEPABLE) {
			spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
		} else {
			spin_start = LCK_MTX_SPIN_SPIN_BEGIN();
		}

		/*
		 * Adaptive spin or interlock
		 *
		 * Evaluate if adaptive spinning should be attempted,
		 * and if yes go to adaptive spin.
		 *
		 * Otherwise (and this includes always-spin mutexes),
		 * go for the interlock.
		 */
		if (mode != LCK_MTX_MODE_SPIN_ALWAYS &&
		    (state.ilocked || state.as_tail || !state.owner ||
		    lck_mtx_ctid_on_core(state.owner))) {
			lck_mtx_lock_adaptive_spin(lock, state);
		} else {
			direct_wait = true;
			lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_DIRECT);
		}

		if (mode == LCK_MTX_MODE_SLEEPABLE) {
			LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
		} else {
			LCK_MTX_SPIN_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
		}

		/*
		 * Take or sleep
		 *
		 * We now have the interlock. Either the owner
		 * isn't set, and the mutex is ours to claim,
		 * or we must go to sleep.
		 *
		 * If we go to sleep, we need to set LCK_MTX_NEEDS_WAKEUP
		 * to force the current lock owner to call
		 * lck_mtx_unlock_wakeup().
		 */
		state = os_atomic_load(&lock->lck_mtx, relaxed);
		if (state.owner == LCK_MTX_NULL_CTID) {
			break;
		}

		if (mode == LCK_MTX_MODE_SPIN_ALWAYS) {
			__lck_mtx_lock_is_sleepable_panic(lock);
		}

#if CONFIG_DTRACE
		if (profile) {
			LCK_MTX_PROF_WAIT(lock, lock->lck_mtx_grp,
			    direct_wait, &first_miss);
		}
#endif /* CONFIG_DTRACE */
		/* "snitch" on the owner: it must wake us on unlock (plain store
		 * is fine: we hold the interlock) */
		os_atomic_store(&lock->lck_mtx.data,
		    state.data | LCK_MTX_ILOCK | LCK_MTX_NEEDS_WAKEUP,
		    compiler_acq_rel);
		ts = lck_mtx_lock_wait(lock, thread,
		    ctid_get_thread(state.owner), ts);

		/* returns interlock unlocked and preemption re-enabled */
		lock_disable_preemption_for_thread(thread);
	}

	/*
	 * We can take the lock!
	 *
	 * We only have the interlock and the owner field is 0.
	 *
	 * Perform various turnstile cleanups if needed,
	 * claim the lock, and reenable preemption (if needed).
	 */
	if (ts) {
		lck_mtx_lock_wait_done(lock, ts);
	}
	data = ctid | profile;
	if (lck_mtx_lock_will_need_wakeup(lock, thread)) {
		data |= LCK_MTX_NEEDS_WAKEUP;
	}
	if (mode != LCK_MTX_MODE_SLEEPABLE) {
		/* spin modes keep the interlock held on return */
		data |= LCK_MTX_ILOCK | LCK_MTX_SPIN_MODE;
	}
	os_atomic_store(&lock->lck_mtx.data, data, release);

	if (mode == LCK_MTX_MODE_SLEEPABLE) {
		lock_enable_preemption();
	}

	assert(thread->turnstile != NULL);

	if (ts) {
		turnstile_cleanup();
	}
	LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
	    mode != LCK_MTX_MODE_SLEEPABLE, profile);
}
957
#if LCK_MTX_CHECK_INVARIANTS || CONFIG_DTRACE
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
/*
 * Slowpath tail of lck_mtx_lock_fastpath(): run the preemption
 * invariants, retry the acquisition when only the LCK_MTX_PROFILE bit
 * defeated the fastpath CAS, fire acquisition probes, and otherwise
 * fall through to the contended path.
 */
static void
lck_mtx_lock_slow(
	lck_mtx_t              *lock,
	thread_t                thread,
	lck_mtx_state_t         state,
	lck_mtx_mode_t          mode)
{
#pragma unused(state)
#if CONFIG_DTRACE
	/* the state of an uncontended-but-profiled lock: only PROFILE set */
	lck_mtx_state_t ostate = {
		.data = LCK_MTX_PROFILE,
	};
#endif /* CONFIG_DTRACE */

#if LCK_MTX_CHECK_INVARIANTS
	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
		lck_mtx_check_preemption(lock, thread,
		    (mode == LCK_MTX_MODE_SPIN));
	}
#endif /* LCK_MTX_CHECK_INVARIANTS */
#if CONFIG_DTRACE
	/*
	 * If the fastpath failed solely because the profile bit was set,
	 * retry here keeping that bit in the new state.
	 */
	if (state.val == ostate.val) {
		state.data = thread->ctid | LCK_MTX_PROFILE;
		if (mode != LCK_MTX_MODE_SLEEPABLE) {
			state.ilocked = true;
			state.spin_mode = true;
		}
		os_atomic_cmpxchgv(&lock->lck_mtx.val,
		    ostate.val, state.val, &state.val, acquire);
	}
	/*
	 * The lock is ours if the observed state had no bits beyond
	 * PROFILE: either the fastpath CAS succeeded (state.val == 0,
	 * we got here only for the dtrace probe) or the retry above did.
	 */
	if ((state.val & ~ostate.val) == 0) {
		LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
		    mode != LCK_MTX_MODE_SLEEPABLE,
		    state.data & LCK_MTX_PROFILE);
		return;
	}
#endif /* CONFIG_DTRACE */
	lck_mtx_lock_contended(lock, thread, mode);
}
1002
/*
 * Common fastpath for lck_mtx_lock{,_spin,_spin_always}():
 * a single CAS of the whole 64-bit mutex state from fully idle (0)
 * to owned; anything else goes to lck_mtx_lock_slow().
 */
static __attribute__((always_inline)) void
lck_mtx_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
{
	thread_t thread = current_thread();
	lck_mtx_state_t state = {
		.data = thread->ctid,
	};
	uint64_t take_slowpath = 0;

	/* force the slowpath when invariants/dtrace need to run */
	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
		take_slowpath |= LCK_MTX_SNIFF_PREEMPTION(thread);
	}
	take_slowpath |= LCK_MTX_SNIFF_DTRACE();

	if (mode != LCK_MTX_MODE_SLEEPABLE) {
		/* spin modes hold the interlock for the entire hold time */
		lock_disable_preemption_for_thread(thread);
		state.ilocked = true;
		state.spin_mode = true;
	}

	/*
	 * Do the CAS on the entire mutex state,
	 * which hence requires for the ILK/AS queues
	 * to be empty (which is fairer).
	 */
	lock_cmpxchgv(&lock->lck_mtx.val,
	    0, state.val, &state.val, acquire);

	/* nonzero observed state means the CAS did not acquire the lock */
	take_slowpath |= state.val;
	if (__improbable(take_slowpath)) {
		return lck_mtx_lock_slow(lock, thread, state, mode);
	}
}
1036
/*
 * lck_mtx_lock(): acquire a mutex, blocking (sleeping) on contention.
 */
void
lck_mtx_lock(lck_mtx_t *lock)
{
	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
}
1042
/*
 * lck_mtx_lock_spin(): acquire a mutex as a spin lock
 * (holds the interlock, leaves preemption disabled).
 */
void
lck_mtx_lock_spin(lck_mtx_t *lock)
{
	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
}
1048
/*
 * lck_mtx_lock_spin_always(): like lck_mtx_lock_spin(), but legal to call
 * with preemption already disabled (skips the preemption sniff).
 */
void
lck_mtx_lock_spin_always(lck_mtx_t *lock)
{
	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
}
1054
1055
1056 #pragma mark lck_mtx_t: lck_mtx_try_lock
1057
/*
 * lck_mtx_try_lock_slow_inline()
 *
 * Shared slow path for lck_mtx_try_lock_fastpath().
 * `odata` is the mutex data observed by the failed fast path CAS,
 * `ndata` the value the fast path tried to install, and `spin` whether
 * the attempt was made in spin mode (preemption currently disabled).
 *
 * Returns true if the lock was acquired after all (the fast path CAS
 * only failed because of DTrace profiling), false otherwise.
 */
static __attribute__((always_inline)) bool
lck_mtx_try_lock_slow_inline(
	lck_mtx_t *lock,
	thread_t thread,
	uint32_t odata,
	uint32_t ndata,
	bool spin)
{
#pragma unused(lock, thread, odata, ndata)
#if CONFIG_DTRACE
	/*
	 * The lock was free but profiled: retry, preserving the
	 * profile bit in the new value.
	 */
	if (odata == LCK_MTX_PROFILE) {
		os_atomic_cmpxchgv(&lock->lck_mtx.data,
		    odata, ndata | LCK_MTX_PROFILE, &odata, acquire);
	}
	/* Observed value with only the profile bit set: the CAS won. */
	if ((odata & ~LCK_MTX_PROFILE) == 0) {
		LCK_MTX_TRY_ACQUIRED(lock, lock->lck_mtx_grp,
		    spin, odata & LCK_MTX_PROFILE);
		return true;
	}
	if (odata & LCK_MTX_PROFILE) {
		LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &(int){ 0 });
	}
#endif /* CONFIG_DTRACE */

	/* Undo the preemption disable done by the spin-mode fast path. */
	if (spin) {
		lock_enable_preemption();
	}
	return false;
}
1087
/*
 * Out-of-line (when profiling/invariants are built in) sleepable-mode
 * instantiation of lck_mtx_try_lock_slow_inline().
 */
#if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static bool
lck_mtx_try_lock_slow(
	lck_mtx_t *lock,
	thread_t thread,
	uint32_t odata,
	uint32_t ndata)
{
	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, false);
}
1102
/*
 * Out-of-line (when profiling/invariants are built in) spin-mode
 * instantiation of lck_mtx_try_lock_slow_inline().
 */
#if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static bool
lck_mtx_try_lock_slow_spin(
	lck_mtx_t *lock,
	thread_t thread,
	uint32_t odata,
	uint32_t ndata)
{
	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, true);
}
1117
/*
 * lck_mtx_try_lock_fastpath()
 *
 * Common fast path for lck_mtx_try_lock{,_spin,_spin_always}().
 * Returns true if the lock was acquired, false otherwise.
 */
static __attribute__((always_inline)) bool
lck_mtx_try_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
{
	thread_t thread = current_thread();
	uint32_t odata, ndata = thread->ctid;
	uint32_t take_slowpath = 0;

#if CONFIG_DTRACE
	take_slowpath |= lck_debug_state.lds_value;
#endif
	if (mode != LCK_MTX_MODE_SLEEPABLE) {
		lock_disable_preemption_for_thread(thread);
		ndata |= LCK_MTX_SPIN_MODE | LCK_MTX_ILOCK;
	}

	/*
	 * Because try_lock is likely to be used for cases like
	 * lock-inversion resolution, it tries a bit harder than
	 * lck_mtx_lock() to take the lock: it ignores the
	 * adaptive spin / interlock queues by doing the CAS
	 * on the 32bit mutex data only.
	 */
	lock_cmpxchgv(&lock->lck_mtx.data, 0, ndata, &odata, acquire);

	take_slowpath |= odata;
	if (__probable(!take_slowpath)) {
		return true;
	}

	/*
	 * spin-always on a lock currently held as a full (sleepable)
	 * mutex is a caller bug: we might be spinning with preemption
	 * disabled on an owner that can sleep.
	 */
	if (mode == LCK_MTX_MODE_SPIN_ALWAYS &&
	    (odata & LCK_MTX_CTID_MASK) &&
	    !(odata & LCK_MTX_SPIN_MODE)) {
		__lck_mtx_lock_is_sleepable_panic(lock);
	}

	if (mode == LCK_MTX_MODE_SLEEPABLE) {
		return lck_mtx_try_lock_slow(lock, thread, odata, ndata);
	} else {
		return lck_mtx_try_lock_slow_spin(lock, thread, odata, ndata);
	}
}
1159
/*
 * lck_mtx_try_lock(): try to acquire a mutex without blocking.
 * Returns TRUE on success.
 */
boolean_t
lck_mtx_try_lock(lck_mtx_t *lock)
{
	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
}
1165
/*
 * lck_mtx_try_lock_spin(): try to acquire a mutex as a spin lock.
 * Returns TRUE on success (preemption stays disabled until unlock).
 */
boolean_t
lck_mtx_try_lock_spin(lck_mtx_t *lock)
{
	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
}
1171
/*
 * lck_mtx_try_lock_spin_always(): like lck_mtx_try_lock_spin(), but legal
 * to call with preemption already disabled; panics if the lock is held
 * as a full (sleepable) mutex.
 */
boolean_t
lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
{
	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
}
1177
1178
1179 #pragma mark lck_mtx_t: lck_mtx_unlock
1180
/*
 * lck_mtx_unlock_contended()
 *
 * Slow path of lck_mtx_unlock(): the release CAS failed (interlock held,
 * waiters to wake up) or the lock was held in spin mode.  `data` is the
 * mutex data observed when the fast path failed.
 */
static NOINLINE void
lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, uint32_t data)
{
	bool cleanup = false;

#if !CONFIG_DTRACE
	/*
	 * This check is done by lck_mtx_unlock_slow() when it is enabled.
	 */
	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
		__lck_mtx_not_owned_panic(lock, thread);
	}
#endif /* !CONFIG_DTRACE */

	/*
	 * In spin mode we already hold the interlock (with preemption
	 * disabled); otherwise take it now before touching the state.
	 */
	if ((data & LCK_MTX_SPIN_MODE) == 0) {
		lock_disable_preemption_for_thread(thread);
		lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_UNLOCK);
	}

	/*
	 * We must re-load the data: we might have taken
	 * the slowpath because another thread had taken
	 * the interlock and set the NEEDS_WAKEUP bit
	 * while we were spinning to get it.
	 */
	data = os_atomic_load(&lock->lck_mtx.data, compiler_acq_rel);
	if (data & LCK_MTX_NEEDS_WAKEUP) {
		lck_mtx_unlock_wakeup(lock, thread);
		cleanup = true;
	}
	/* Drop ownership and the interlock, preserving only the profile bit. */
	lck_mtx_ilk_unlock_v(lock, data & LCK_MTX_PROFILE);

	LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, data & LCK_MTX_PROFILE);

	/*
	 * Do not do any turnstile operations outside of this block.
	 *
	 * lock/unlock is called at early stage of boot while single
	 * threaded, without turnstiles being available yet.
	 * Even without contention we can come through the slow path
	 * if the mutex is acquired as a spin lock.
	 */
	if (cleanup) {
		turnstile_cleanup();
	}
}
1227
/*
 * lck_mtx_unlock_slow()
 *
 * Out-of-line continuation of lck_mtx_unlock() (out of line only when
 * DTrace is built in, so that the profiled release can be handled here).
 */
#if CONFIG_DTRACE
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static void
lck_mtx_unlock_slow(lck_mtx_t *lock, thread_t thread, uint32_t data)
{
#if CONFIG_DTRACE
	/*
	 * If Dtrace is enabled, locks can be profiled,
	 * which causes the fastpath of unlock to fail.
	 */
	if ((data & LCK_MTX_BITS_MASK) == LCK_MTX_PROFILE) {
		/* Retry the release, leaving only the profile bit behind. */
		os_atomic_cmpxchgv(&lock->lck_mtx.data, data, LCK_MTX_PROFILE,
		    &data, release);
	}
	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
		__lck_mtx_not_owned_panic(lock, thread);
	}
	/* No bits other than (possibly) profile: the release succeeded. */
	if ((data & (LCK_MTX_BITS_MASK & ~LCK_MTX_PROFILE)) == 0) {
		LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, false);
		return;
	}
#endif /* CONFIG_DTRACE */

	lck_mtx_unlock_contended(lock, thread, data);
}
1256
/*
 * lck_mtx_unlock(): release a mutex held by the current thread.
 */
void
lck_mtx_unlock(lck_mtx_t *lock)
{
	thread_t thread = current_thread();
	uint32_t take_slowpath = 0;
	/*
	 * `data` is always written by lock_cmpxchgv() below (it receives
	 * the observed value even on success), so it is initialized on
	 * every path reaching lck_mtx_unlock_slow().
	 */
	uint32_t data;

	take_slowpath |= LCK_MTX_SNIFF_DTRACE();

	/*
	 * The fast path ignores the ILK/AS queues on purpose,
	 * those really are a "lock" concept, not unlock.
	 */
	if (__probable(lock_cmpxchgv(&lock->lck_mtx.data,
	    thread->ctid, 0, &data, release))) {
		if (__probable(!take_slowpath)) {
			return;
		}
	}

	lck_mtx_unlock_slow(lock, thread, data);
}
1279
1280
1281 #pragma mark lck_mtx_t: misc
1282
1283 void
lck_mtx_assert(lck_mtx_t * lock,unsigned int type)1284 lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
1285 {
1286 lck_mtx_state_t state = os_atomic_load(&lock->lck_mtx, relaxed);
1287 thread_t thread = current_thread();
1288
1289 if (type == LCK_MTX_ASSERT_OWNED) {
1290 if (state.owner != thread->ctid) {
1291 __lck_mtx_not_owned_panic(lock, thread);
1292 }
1293 } else if (type == LCK_MTX_ASSERT_NOTOWNED) {
1294 if (state.owner == thread->ctid) {
1295 __lck_mtx_owned_panic(lock, thread);
1296 }
1297 } else {
1298 panic("lck_mtx_assert(): invalid arg (%u)", type);
1299 }
1300 }
1301
1302 /*
1303 * Routine: lck_mtx_convert_spin
1304 *
1305 * Convert a mutex held for spin into a held full mutex
1306 */
1307 void
lck_mtx_convert_spin(lck_mtx_t * lock)1308 lck_mtx_convert_spin(lck_mtx_t *lock)
1309 {
1310 lck_mtx_state_t state = os_atomic_load(&lock->lck_mtx, relaxed);
1311 thread_t thread = current_thread();
1312 uint32_t data = thread->ctid;
1313
1314 if (state.owner != data) {
1315 __lck_mtx_not_owned_panic(lock, thread);
1316 }
1317
1318 if (state.spin_mode) {
1319 /*
1320 * Note: we can acquire the lock in spin mode
1321 * _and_ be the inheritor if we waited.
1322 *
1323 * We must only clear ilocked and spin_mode,
1324 * but preserve owner and needs_wakeup.
1325 */
1326 state.ilocked = false;
1327 state.spin_mode = false;
1328 lck_mtx_ilk_unlock_v(lock, state.data);
1329 turnstile_cleanup();
1330 }
1331 }
1332
1333 /*
1334 * Routine: kdp_lck_mtx_lock_spin_is_acquired
1335 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1336 */
1337 boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t * lck)1338 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
1339 {
1340 lck_mtx_state_t state = os_atomic_load(&lck->lck_mtx, relaxed);
1341
1342 if (not_in_kdp) {
1343 panic("panic: spinlock acquired check done outside of kernel debugger");
1344 }
1345 if (state.data == LCK_MTX_TAG_DESTROYED) {
1346 return false;
1347 }
1348 return state.owner || state.ilocked;
1349 }
1350
/*
 * kdp_lck_mtx_find_owner()
 *
 * Kernel-debugger helper: report the mutex a waiter is blocked on
 * (as an unslid context) and the tid of its current owner.
 */
void
kdp_lck_mtx_find_owner(
	struct waitq *waitq __unused,
	event64_t event,
	thread_waitinfo_t *waitinfo)
{
	lck_mtx_t *mutex = LCK_EVENT_TO_MUTEX(event);
	lck_mtx_state_t state = os_atomic_load(&mutex->lck_mtx, relaxed);

	assert3u(state.data, !=, LCK_MTX_TAG_DESTROYED);
	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
	waitinfo->owner = thread_tid(ctid_get_thread(state.owner));
}
1364
1365 #endif /* !LCK_MTX_USE_ARCH */
1366
1367 /*
1368 * Routine: mutex_pause
1369 *
1370 * Called by former callers of simple_lock_pause().
1371 */
#define MAX_COLLISION_COUNTS 32 /* buckets in the collision histogram */
#define MAX_COLLISION 8         /* distinct backoff delays available */

/* Histogram of mutex_pause() calls, indexed by (clamped) collision count. */
unsigned int max_collision_count[MAX_COLLISION_COUNTS];

/*
 * Backoff intervals indexed by (clamped) collision count; passed to
 * assert_wait_timeout() with an NSEC_PER_USEC scale factor, i.e.
 * interpreted as microseconds.
 */
uint32_t collision_backoffs[MAX_COLLISION] = {
	10, 50, 100, 200, 400, 600, 800, 1000
};
1380
1381
1382 void
mutex_pause(uint32_t collisions)1383 mutex_pause(uint32_t collisions)
1384 {
1385 wait_result_t wait_result;
1386 uint32_t back_off;
1387
1388 if (collisions >= MAX_COLLISION_COUNTS) {
1389 collisions = MAX_COLLISION_COUNTS - 1;
1390 }
1391 max_collision_count[collisions]++;
1392
1393 if (collisions >= MAX_COLLISION) {
1394 collisions = MAX_COLLISION - 1;
1395 }
1396 back_off = collision_backoffs[collisions];
1397
1398 wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, back_off, NSEC_PER_USEC);
1399 assert(wait_result == THREAD_WAITING);
1400
1401 wait_result = thread_block(THREAD_CONTINUE_NULL);
1402 assert(wait_result == THREAD_TIMED_OUT);
1403 }
1404
1405
/* lck_mtx_yield() statistics: yields that slept vs. returned immediately. */
unsigned int mutex_yield_wait = 0;
unsigned int mutex_yield_no_wait = 0;
1408
1409 boolean_t
lck_mtx_yield(lck_mtx_t * lck)1410 lck_mtx_yield(
1411 lck_mtx_t *lck)
1412 {
1413 bool has_waiters = LCK_MTX_HAS_WAITERS(lck);
1414
1415 #if DEBUG
1416 lck_mtx_assert(lck, LCK_MTX_ASSERT_OWNED);
1417 #endif /* DEBUG */
1418
1419 if (!has_waiters) {
1420 mutex_yield_no_wait++;
1421 } else {
1422 mutex_yield_wait++;
1423 lck_mtx_unlock(lck);
1424 mutex_pause(0);
1425 lck_mtx_lock(lck);
1426 }
1427 return has_waiters;
1428 }
1429