/*
 * Copyright (c) 2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#define LOCK_PRIVATE 1

#include <mach_ldebug.h>
#include <kern/locks_internal.h>
#include <kern/lock_stat.h>
#include <kern/locks.h>
#include <kern/kalloc.h>
#include <kern/thread.h>

#include <mach/machine/sdt.h>

#include <machine/cpu_data.h>
#include <machine/machine_cpu.h>

#if !LCK_MTX_USE_ARCH

/*
 * lck_mtx_t
 * ~~~~~~~~~
 *
 * Kernel mutexes in this implementation are made of four 32-bit words:
 *
 * - word 0: turnstile compact ID (24 bits) and the 0x22 lock tag
 * - word 1: padding (to be used for group compact IDs)
 * - word 2: mutex state (lock owner + interlock, spin and waiters bits),
 *           referred to as "data" in the code.
 * - word 3: adaptive spin and interlock MCS queue tails.
 *
 * The 64-bit word made of the last two words is referred to
 * as the "mutex state" in code.
 *
 *
 * Core serialization rules
 * ~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * The mutex has a bit (lck_mtx_t::lck_mtx.ilocked, or bit LCK_MTX_ILOCK
 * of the data word) that serves as a spinlock for the mutex state.
 *
 *
 * Updating the lock fields must follow these rules:
 *
 * - It is OK to "steal" the mutex (updating its data field) if no one
 *   holds the interlock.
 *
 * - Holding the interlock allows its holder to update the first 3 words
 *   of the kernel mutex without using RMW atomics (plain stores are OK).
 *
 * - Holding the interlock is required for a thread to remove itself
 *   from the adaptive spin queue.
 *
 * - Threads can enqueue themselves onto the adaptive spin wait queue
 *   or the interlock wait queue at any time.
 *
 *
 * Waiters bit and turnstiles
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * The turnstile on a kernel mutex is set by waiters, and cleared
 * once they have all been resumed and have successfully acquired the lock.
 *
 * LCK_MTX_NEEDS_WAKEUP being set (always with an owner set too)
 * forces threads into the lck_mtx_unlock slowpath,
 * in order to evaluate whether lck_mtx_unlock_wakeup() must be called.
 *
 * As a result, it really only needs to be set at select times:
 *
 * - when a thread blocks and "snitches" on the current thread owner,
 *   so that when that thread unlocks it calls lck_mtx_unlock_wakeup(),
 *
 * - when a thread that was woken up resumes its work and becomes
 *   the inheritor.
 */

#define ADAPTIVE_SPIN_ENABLE    0x1

#define NOINLINE                __attribute__((noinline))
#define LCK_MTX_EVENT(lck)      CAST_EVENT64_T(&(lck)->lck_mtx.data)
#define LCK_EVENT_TO_MUTEX(e)   __container_of((uint32_t *)(e), lck_mtx_t, lck_mtx.data)
#define LCK_MTX_HAS_WAITERS(l)  ((l)->lck_mtx.data & LCK_MTX_NEEDS_WAKEUP)

#if DEVELOPMENT || DEBUG
TUNABLE(bool, LckDisablePreemptCheck, "-disable_mtx_chk", false);
#endif /* DEVELOPMENT || DEBUG */

extern unsigned int not_in_kdp;

#if CONFIG_SPTM
extern const bool *sptm_xnu_triggered_panic_ptr;
#endif /* CONFIG_SPTM */

KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);

#define LCK_MTX_NULL_CTID       0x00000000u

__enum_decl(lck_mtx_mode_t, uint32_t, {
	LCK_MTX_MODE_SLEEPABLE,
	LCK_MTX_MODE_SPIN,
	LCK_MTX_MODE_SPIN_ALWAYS,
});

__enum_decl(lck_ilk_mode_t, uint32_t, {
	LCK_ILK_MODE_UNLOCK,
	LCK_ILK_MODE_DIRECT,
	LCK_ILK_MODE_FROM_AS,
});

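/*
 * Per-CPU MCS node helpers.
 *
 * Each CPU has a struct lck_mtx_mcs node (slot LCK_MCS_SLOT_0) that is
 * used both for the adaptive spin queue and for the interlock queue.
 * lck_mtx_get_mcs_id() returns the compact ID of the current CPU's node,
 * and lck_mtx_get_mcs() maps such an ID back to its node.
 */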
static inline void
lck_mtx_mcs_clear(lck_mtx_mcs_t mcs)
{
	*mcs = (struct lck_mtx_mcs){ };
}

static inline lck_mcs_id_t
lck_mtx_get_mcs_id(void)
{
	return lck_mcs_id_current(LCK_MCS_SLOT_0);
}

__pure2
static inline lck_mtx_mcs_t
lck_mtx_get_mcs(lck_mcs_id_t idx)
{
	return &lck_mcs_get_other(idx)->mcs_mtx;
}


#pragma mark lck_mtx_t: validation

__abortlike
static void
__lck_mtx_invalid_panic(lck_mtx_t *lck)
{
	panic("Invalid/destroyed mutex %p: "
	    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
	    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
	    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
}

__abortlike
static void
__lck_mtx_not_owned_panic(lck_mtx_t *lock, thread_t thread)
{
	panic("Mutex %p is unexpectedly not owned by thread %p", lock, thread);
}

__abortlike
static void
__lck_mtx_owned_panic(lck_mtx_t *lock, thread_t thread)
{
	panic("Mutex %p is unexpectedly owned by thread %p", lock, thread);
}

__abortlike
static void
__lck_mtx_lock_is_sleepable_panic(lck_mtx_t *lck)
{
	// "Always" variants can never block. If the lock is held as a normal mutex
	// then someone is mixing always and non-always calls on the same lock, which is
	// forbidden.
	panic("Mutex %p is held as a full-mutex (spin-always lock attempted)", lck);
}

#if DEVELOPMENT || DEBUG
__abortlike
static void
__lck_mtx_preemption_disabled_panic(lck_mtx_t *lck, int expected)
{
	panic("Attempt to take mutex %p with preemption disabled (%d)",
	    lck, get_preemption_level() - expected);
}

__abortlike
static void
__lck_mtx_at_irq_panic(lck_mtx_t *lck)
{
	panic("Attempt to take mutex %p in IRQ context", lck);
}

/*
 * Routine: lck_mtx_check_preemption
 *
 * Verify preemption is enabled when attempting to acquire a mutex.
 */
static inline void
lck_mtx_check_preemption(lck_mtx_t *lock, thread_t thread, int expected)
{
#pragma unused(thread)
	if (lock_preemption_level_for_thread(thread) == expected) {
		return;
	}
	if (LckDisablePreemptCheck) {
		return;
	}
	if (current_cpu_datap()->cpu_hibernate) {
		return;
	}
	if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
		return;
	}
#if CONFIG_SPTM
	/*
	 * If a panic has been initiated on SPTM devices, preemption was disabled by sleh,
	 * but platform callbacks could be acquiring mutexes
	 */
	if (*sptm_xnu_triggered_panic_ptr) {
		return;
	}
#endif
	__lck_mtx_preemption_disabled_panic(lock, expected);
}

static inline void
lck_mtx_check_irq(lck_mtx_t *lock)
{
	if (ml_at_interrupt_context()) {
		__lck_mtx_at_irq_panic(lock);
	}
}

#define LCK_MTX_SNIFF_PREEMPTION(thread)  lock_preemption_level_for_thread(thread)
#define LCK_MTX_CHECK_INVARIANTS          1
#else
#define lck_mtx_check_irq(lck)            ((void)0)
#define LCK_MTX_SNIFF_PREEMPTION(thread)  0
#define LCK_MTX_CHECK_INVARIANTS          0
#endif /* !DEVELOPMENT && !DEBUG */

#if CONFIG_DTRACE
#define LCK_MTX_SNIFF_DTRACE()  lck_debug_state.lds_value
#else
#define LCK_MTX_SNIFF_DTRACE()  0
#endif


#pragma mark lck_mtx_t: alloc/init/destroy/free

lck_mtx_t *
lck_mtx_alloc_init(lck_grp_t *grp, lck_attr_t *attr)
{
	lck_mtx_t *lck;

	lck = zalloc(KT_LCK_MTX);
	lck_mtx_init(lck, grp, attr);
	return lck;
}

void
lck_mtx_free(lck_mtx_t *lck, lck_grp_t *grp)
{
	lck_mtx_destroy(lck, grp);
	zfree(KT_LCK_MTX, lck);
}

__mockable void
lck_mtx_init(lck_mtx_t *lck, lck_grp_t *grp, lck_attr_t *attr)
{
	if (attr == LCK_ATTR_NULL) {
		attr = &lck_attr_default;
	}

	*lck = (lck_mtx_t){
		.lck_mtx_type = LCK_TYPE_MUTEX,
		.lck_mtx_grp  = grp->lck_grp_attr_id,
	};
	if (attr->lck_attr_val & LCK_ATTR_DEBUG) {
		lck->lck_mtx.data |= LCK_MTX_PROFILE;
	}

	lck_grp_reference(grp, &grp->lck_grp_mtxcnt);
}

__mockable void
lck_mtx_destroy(lck_mtx_t *lck, lck_grp_t *grp)
{
	if (lck->lck_mtx_tsid && lck->lck_mtx_type == LCK_TYPE_MUTEX) {
		panic("Mutex to destroy still has waiters: %p: "
		    "<0x%06x 0x%02x 0x%08x 0x%08x/%p 0x%04x 0x%04x>",
		    lck, lck->lck_mtx_tsid, lck->lck_mtx_type, lck->lck_mtx_grp,
		    lck->lck_mtx.data, ctid_get_thread_unsafe(lck->lck_mtx.owner),
		    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail);
	}
	if (lck->lck_mtx_type != LCK_TYPE_MUTEX ||
	    (lck->lck_mtx.data & ~LCK_MTX_PROFILE) ||
	    lck->lck_mtx.as_tail || lck->lck_mtx.ilk_tail) {
		__lck_mtx_invalid_panic(lck);
	}
	LCK_GRP_ASSERT_ID(grp, lck->lck_mtx_grp);
	lck->lck_mtx_type = LCK_TYPE_NONE;
	lck->lck_mtx.data = LCK_MTX_TAG_DESTROYED;
	lck->lck_mtx_grp  = 0;
	lck_grp_deallocate(grp, &grp->lck_grp_mtxcnt);
}


#pragma mark lck_mtx_t: lck_mtx_ilk*

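/*
 * Routine: lck_mtx_ilk_timeout_panic
 *
 * hw_spin_policy timeout handler for the mutex interlock: panics with a
 * dump of the lock state once lock_panic_timeout has been exceeded.
 */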
static hw_spin_timeout_status_t
lck_mtx_ilk_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
{
	lck_mtx_t *lck = _lock;

	panic("Mutex interlock[%p] " HW_SPIN_TIMEOUT_FMT "; "
	    "current owner: %p, "
	    "<0x%06x 0x%02x 0x%08x 0x%08x 0x%04x 0x%04x>, "
	    HW_SPIN_TIMEOUT_DETAILS_FMT,
	    lck, HW_SPIN_TIMEOUT_ARG(to, st),
	    ctid_get_thread_unsafe(lck->lck_mtx.owner),
	    lck->lck_mtx_tsid, lck->lck_mtx_type,
	    lck->lck_mtx_grp, lck->lck_mtx.data,
	    lck->lck_mtx.as_tail, lck->lck_mtx.ilk_tail,
	    HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
}

static const struct hw_spin_policy lck_mtx_ilk_timeout_policy = {
	.hwsp_name           = "lck_mtx_t (ilk)",
	.hwsp_timeout_atomic = &lock_panic_timeout,
	.hwsp_op_timeout     = lck_mtx_ilk_timeout_panic,
};

static void
lck_mtx_ilk_lock_cleanup_as_mcs(
	lck_mtx_t              *lock,
	lck_mcs_id_t            idx,
	lck_mtx_mcs_t           mcs,
	hw_spin_timeout_t       to,
	hw_spin_state_t        *ss)
{
	lck_mtx_mcs_t nnode = NULL;
	lck_mcs_id_t  pidx  = (lck_mcs_id_t)mcs->lmm_as_prev;
	bool          was_last;

	/*
	 * This is called when the thread made use
	 * of the adaptive spin queue and needs
	 * to remove itself from it.
	 */

	/*
	 * If the thread is last, set the tail to the node before us.
	 */
	was_last = lock_cmpxchg(&lock->lck_mtx.as_tail, idx, pidx, release);

	if (was_last) {
		/*
		 * If @c mcs was last, we need to erase the previous
		 * node's link to it.
		 *
		 * However, new nodes could have now taken our place
		 * and set the previous node's @c lmm_as_next field
		 * already, so we must CAS rather than blindly set.
		 *
		 * We know the previous node is stable because
		 * we hold the interlock (preventing concurrent
		 * removals).
		 */
		if (pidx) {
			os_atomic_cmpxchg(&lck_mtx_get_mcs(pidx)->lmm_as_next,
			    mcs, nnode, relaxed);
		}
	} else {
		/*
		 * If @c mcs wasn't last, then wait to make sure
		 * we observe @c lmm_as_next. Once we do, we know
		 * the field is stable since we hold the interlock
		 * (preventing concurrent dequeues).
		 *
		 * We can then update it to @c mcs's next node index
		 * (which is also stable for similar reasons).
		 *
		 * Lastly, update the previous node's @c lmm_as_next
		 * field as well to terminate the dequeue.
		 */
		while (!hw_spin_wait_until(&mcs->lmm_as_next, nnode, nnode)) {
			hw_spin_policy_t pol = &lck_mtx_ilk_timeout_policy;
			hw_spin_should_keep_spinning(lock, pol, to, ss);
		}

		os_atomic_store(&nnode->lmm_as_prev, pidx, relaxed);
		if (pidx) {
			os_atomic_store(&lck_mtx_get_mcs(pidx)->lmm_as_next,
			    nnode, relaxed);
		}
	}

	/*
	 * @c mcs's fields are left dangling;
	 * it is the responsibility of the caller
	 * to terminate the cleanup.
	 */
}

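/*
 * Routine: lck_mtx_ilk_lock_contended
 *
 * Slowpath of lck_mtx_ilk_lock_nopreempt(): queue on the interlock MCS
 * queue, spin until this CPU is at its head, then take the interlock
 * once it is observed to be free.
 */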
static NOINLINE void
lck_mtx_ilk_lock_contended(
	lck_mtx_t              *lock,
	lck_mtx_state_t         state,
	lck_ilk_mode_t          mode)
{
	hw_spin_policy_t  pol = &lck_mtx_ilk_timeout_policy;
	hw_spin_timeout_t to  = hw_spin_compute_timeout(pol);
	hw_spin_state_t   ss  = { };

	lck_mtx_mcs_t   mcs, nnode, pnode;
	lck_mcs_id_t    idx, pidx;
	lck_mtx_state_t nstate;
	unsigned long   ready;
	uint64_t        spin_start;

	/*
	 * Take a spot in the interlock MCS queue,
	 * and then spin until we're at the head of it.
	 */

	idx = lck_mtx_get_mcs_id();
	mcs = &lck_mcs_get_current()->mcs_mtx;
	if (mode != LCK_MTX_MODE_SPIN) {
		spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
	}

	mcs->lmm_ilk_current = lock;
	pidx = os_atomic_xchg(&lock->lck_mtx.ilk_tail, idx, release);
	if (pidx) {
		pnode = lck_mtx_get_mcs(pidx);
		os_atomic_store(&pnode->lmm_ilk_next, mcs, relaxed);

		while (!hw_spin_wait_until(&mcs->lmm_ilk_ready, ready, ready)) {
			hw_spin_should_keep_spinning(lock, pol, to, &ss);
		}
	}


	/*
	 * We're now the first in line; wait for the interlock
	 * to look ready and take it.
	 *
	 * We can't just assume the lock is ours for the taking,
	 * because the fastpath of lck_mtx_lock_spin{,_always}
	 * only looks at the mutex "data" and might steal it.
	 *
	 * Also clear the interlock MCS tail if @c mcs is last.
	 */
	do {
		while (!hw_spin_wait_until(&lock->lck_mtx.val,
		    state.val, state.ilocked == 0)) {
			hw_spin_should_keep_spinning(lock, pol, to, &ss);
		}

		nstate = state;
		nstate.ilocked = 1;
		if (nstate.ilk_tail == idx) {
			nstate.ilk_tail = 0;
		}
	} while (!os_atomic_cmpxchg(&lock->lck_mtx, state, nstate, acquire));


	/*
	 * We now have the interlock, let's clean up the MCS state.
	 *
	 * First, if there is a node after us, notify it that it
	 * is at the head of the interlock queue.
	 *
	 * Second, perform the adaptive spin MCS cleanup if needed.
	 *
	 * Lastly, clear the MCS node.
	 */
	if (state.ilk_tail != idx) {
		while (!hw_spin_wait_until(&mcs->lmm_ilk_next, nnode, nnode)) {
			hw_spin_should_keep_spinning(lock, pol, to, &ss);
		}

		os_atomic_store(&nnode->lmm_ilk_ready, 1, relaxed);
	}

	if (mode == LCK_ILK_MODE_FROM_AS) {
		lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
	}
	lck_mtx_mcs_clear(mcs);

	if (mode != LCK_MTX_MODE_SPIN) {
		LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
	}
}

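/*
 * Routine: lck_mtx_ilk_lock_nopreempt
 *
 * Take the mutex interlock (preemption must already be disabled),
 * going through lck_mtx_ilk_lock_contended() if it is currently held
 * or if other CPUs are queued for it.
 */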
static void
lck_mtx_ilk_lock_nopreempt(lck_mtx_t *lock, lck_ilk_mode_t mode)
{
	lck_mtx_state_t state, nstate;

	os_atomic_rmw_loop(&lock->lck_mtx.val, state.val, nstate.val, acquire, {
		if (__improbable(state.ilocked || state.ilk_tail)) {
			os_atomic_rmw_loop_give_up({
				return lck_mtx_ilk_lock_contended(lock, state, mode);
			});
		}

		nstate = state;
		nstate.ilocked = true;
	});
}

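/*
 * Routine: lck_mtx_ilk_unlock_v / lck_mtx_ilk_unlock
 *
 * Release the interlock, publishing @c data (respectively the current
 * data with LCK_MTX_ILOCK cleared) with release semantics, and
 * re-enable preemption.
 */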
static void
lck_mtx_ilk_unlock_v(lck_mtx_t *lock, uint32_t data)
{
	os_atomic_store(&lock->lck_mtx.data, data, release);
	lock_enable_preemption();
}

static void
lck_mtx_ilk_unlock(lck_mtx_t *lock)
{
	lck_mtx_ilk_unlock_v(lock, lock->lck_mtx.data & ~LCK_MTX_ILOCK);
}


#pragma mark lck_mtx_t: turnstile integration

/*
 * Routine: lck_mtx_lock_wait
 *
 * Invoked in order to wait on contention.
 *
 * Called with the interlock locked and
 * returns it unlocked.
 *
 * Always aggressively sets the owning thread to promoted,
 * even if it is of the same or higher priority.
 * This prevents the owner from lowering its own priority
 * while holding the lock.
 *
 * TODO: Come up with a more efficient way to handle same-priority promotions
 *      <rdar://problem/30737670> ARM mutex contention logic could avoid taking the thread lock
 */
static struct turnstile *
lck_mtx_lock_wait(
	lck_mtx_t              *lck,
	thread_t                self,
	thread_t                holder,
	struct turnstile       *ts)
{
	uint64_t sleep_start = LCK_MTX_BLOCK_BEGIN();

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(self), 0, 0, 0);

	if (ts == TURNSTILE_NULL) {
		ts = turnstile_prepare_compact_id((uintptr_t)lck,
		    lck->lck_mtx_tsid, TURNSTILE_KERNEL_MUTEX);
		if (lck->lck_mtx_tsid == 0) {
			lck->lck_mtx_tsid = ts->ts_compact_id;
		}
	}
	assert3u(ts->ts_compact_id, ==, lck->lck_mtx_tsid);

	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
	turnstile_update_inheritor(ts, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));

	waitq_assert_wait64(&ts->ts_waitq, LCK_MTX_EVENT(lck),
	    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);

	lck_mtx_ilk_unlock(lck);

	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);

	thread_block(THREAD_CONTINUE_NULL);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);

	LCK_MTX_BLOCK_END(lck, lck->lck_mtx_grp, sleep_start);

	return ts;
}

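/*
 * Routine: lck_mtx_lock_wait_done
 *
 * Drop the turnstile reference taken by lck_mtx_lock_wait(), and clear
 * the lock's turnstile compact ID if this was the last reference.
 */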
static void
lck_mtx_lock_wait_done(lck_mtx_t *lck, struct turnstile *ts)
{
	if (turnstile_complete_compact_id((uintptr_t)lck, ts,
	    TURNSTILE_KERNEL_MUTEX)) {
		lck->lck_mtx_tsid = 0;
	}
}

/*
 * Routine: lck_mtx_lock_will_need_wakeup
 *
 * Returns whether the thread is the current turnstile inheritor,
 * which means it will have to call lck_mtx_unlock_wakeup()
 * on unlock.
 */
__attribute__((always_inline))
static bool
lck_mtx_lock_will_need_wakeup(lck_mtx_t *lck, thread_t self)
{
	uint32_t tsid = lck->lck_mtx_tsid;

	return tsid && turnstile_get_by_id(tsid)->ts_inheritor == self;
}

/*
 * Routine: lck_mtx_unlock_wakeup
 *
 * Invoked on unlock when there is contention.
 *
 * Called with the interlock locked.
 *
 * NOTE: callers should call turnstile_cleanup() after
 * dropping the interlock.
 */
static void
lck_mtx_unlock_wakeup(
	lck_mtx_t              *lck,
	__kdebug_only thread_t  thread)
{
	struct turnstile *ts;
	kern_return_t     did_wake;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START,
	    unslide_for_kdebug(lck), (uintptr_t)thread_tid(thread), 0, 0, 0);

	ts = turnstile_get_by_id(lck->lck_mtx_tsid);

	/*
	 * We can skip turnstile_{prepare,cleanup} because
	 * we hold the interlock of the primitive,
	 * and enqueues/wakeups all happen under the interlock,
	 * which means the turnstile is stable.
	 */
	did_wake = waitq_wakeup64_one(&ts->ts_waitq, LCK_MTX_EVENT(lck),
	    THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
	assert(did_wake == KERN_SUCCESS);

	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
}


#pragma mark lck_mtx_t: lck_mtx_lock

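/*
 * Routine: lck_mtx_ctid_on_core
 *
 * Returns whether the thread designated by @c ctid is currently running
 * on a CPU, which is what makes adaptive spinning worthwhile.
 */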
static inline bool
lck_mtx_ctid_on_core(uint32_t ctid)
{
	thread_t th = ctid_get_thread_unsafe(ctid);

	return th && machine_thread_on_core_allow_invalid(th);
}

#define LCK_MTX_OWNER_FOR_TRACE(lock) \
	VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe((lock)->lck_mtx.data))

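/*
 * Routine: lck_mtx_lock_adaptive_spin
 *
 * Adaptive spinning for lck_mtx_lock(): queue on the adaptive spin MCS
 * queue and, once at its head, spin trying to acquire the mutex state
 * while the owner remains on core. Falls back to
 * lck_mtx_ilk_lock_contended() when spinning is no longer worthwhile.
 */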
static void
lck_mtx_lock_adaptive_spin(lck_mtx_t *lock, lck_mtx_state_t state)
{
	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
	hw_spin_policy_t  pol = &lck_mtx_ilk_timeout_policy;
	hw_spin_timeout_t to  = hw_spin_compute_timeout(pol);
	hw_spin_state_t   ss  = { };
	uint64_t          deadline;

	lck_mtx_mcs_t   mcs, node;
	lck_mcs_id_t    idx, pidx, clear_idx;
	unsigned long   prev;
	lck_mtx_state_t nstate;
	ast_t *const    astp = ast_pending();

	idx = lck_mtx_get_mcs_id();
	mcs = &lck_mcs_get_current()->mcs_mtx;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
	    trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);

	deadline = ml_get_timebase() + os_atomic_load(&MutexSpin, relaxed) * processor_avail_count;

	/*
	 * Take a spot in the adaptive spin queue,
	 * and then spin until we're at the head of it.
	 *
	 * Until we're at the head, we do not need to monitor
	 * for whether the current owner is on core or not:
	 *
	 * 1. the head of the queue is doing it already,
	 *
	 * 2. when the entire adaptive spin queue will "give up"
	 *    as a result of the owner going off core, we want
	 *    to avoid a thundering herd and let the AS queue
	 *    pour into the interlock queue slowly.
	 *
	 * Do give up if the scheduler signals that something
	 * more important has shown up.
	 *
	 * Note: this function is optimized so that we do not touch
	 * our local MCS node when we're the head of the queue.
	 *
	 * This allows us, in the case when the contention is
	 * between 2 cores only, to not have to touch this
	 * cacheline at all.
	 */
	pidx = os_atomic_xchg(&lock->lck_mtx.as_tail, idx, release);
	if (pidx) {
		node = lck_mtx_get_mcs(pidx);
		mcs->lmm_as_prev = pidx;
		os_atomic_store(&node->lmm_as_next, mcs, release);

		while (!hw_spin_wait_until(&mcs->lmm_as_prev, prev,
		    prev == 0 || (os_atomic_load(astp, relaxed) & AST_URGENT) || (ml_get_timebase() > deadline))) {
			hw_spin_should_keep_spinning(lock, pol, to, &ss);
		}

		if (__improbable(prev)) {
			goto adaptive_spin_fail;
		}

		clear_idx = 0;
	} else {
		clear_idx = idx;
	}

	/*
	 * We're now first in line.
	 *
	 * It's our responsibility to monitor the lock's state
	 * for whether (1) the lock has become available,
	 * (2) its owner has gone off core, (3) the scheduler
	 * wants its CPU back, or (4) we've spun for too long.
	 */
	deadline = ml_get_timebase() + os_atomic_load(&MutexSpin, relaxed);

	for (;;) {
		state.val = lock_load_exclusive(&lock->lck_mtx.val, acquire);

		if (__probable(!state.ilocked && !state.ilk_tail && !state.owner)) {
			/*
			 * 2-core contention: if we can, try to dequeue
			 * ourselves from the adaptive spin queue
			 * as part of this CAS in order to avoid
			 * the cost of lck_mtx_ilk_lock_cleanup_as_mcs()
			 * and zeroing the mcs node at all.
			 *
			 * Because the queue is designed to limit contention,
			 * using store-exclusive over an armv8.1 LSE atomic
			 * is actually marginally better (presumably due to
			 * the better codegen).
			 */
			nstate = state;
			nstate.ilocked = true;
			if (state.as_tail == clear_idx) {
				nstate.as_tail = 0;
			}
			if (__probable(lock_store_exclusive(&lock->lck_mtx.val,
			    state.val, nstate.val, acquire))) {
				break;
			}
		} else {
			lock_wait_for_event();
		}

		if (__improbable(ml_get_timebase() > deadline ||
		    (os_atomic_load(astp, relaxed) & AST_URGENT) ||
		    (!state.ilocked && !state.ilk_tail && state.owner &&
		    !lck_mtx_ctid_on_core(state.owner)))) {
			goto adaptive_spin_fail;
		}
	}

	/*
	 * If we're here, we got the lock; we just have to clean up
	 * the MCS nodes and return.
	 */
	if (state.as_tail != clear_idx) {
		lck_mtx_ilk_lock_cleanup_as_mcs(lock, idx, mcs, to, &ss);
		lck_mtx_mcs_clear(mcs);
	}

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(current_thread()),
	    lock->lck_mtx_tsid, 0, 0);
	return;

adaptive_spin_fail:
	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
	    trace_lck, LCK_MTX_OWNER_FOR_TRACE(lock), lock->lck_mtx_tsid, 0, 0);
	return lck_mtx_ilk_lock_contended(lock, state, LCK_ILK_MODE_FROM_AS);
}

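/*
 * Routine: lck_mtx_lock_contended
 *
 * Slowpath of lck_mtx_lock*(): loops stealing the lock when it is idle,
 * adaptive spinning or taking the interlock otherwise, and blocking on
 * the lock's turnstile while another owner remains, until the mutex is
 * finally acquired.
 */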
static NOINLINE void
lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, lck_mtx_mode_t mode)
{
	struct turnstile *ts = TURNSTILE_NULL;
	lck_mtx_state_t   state;
	uint32_t          ctid = thread->ctid;
	uint32_t          data;
#if CONFIG_DTRACE
	int               first_miss = 0;
#endif /* CONFIG_DTRACE */
	bool              direct_wait = false;
	uint64_t          spin_start;
	uint32_t          profile;

	lck_mtx_check_irq(lock);
	if (mode == LCK_MTX_MODE_SLEEPABLE) {
		lock_disable_preemption_for_thread(thread);
	}

	for (;;) {
		/*
		 * Load the current state and perform sanity checks.
		 *
		 * Note that the various "corrupt" values are designed
		 * so that the slowpath is taken when a mutex was used
		 * after destruction, so that we do not have to do
		 * sanity checks in the fast path.
		 */
		state = os_atomic_load(&lock->lck_mtx, relaxed);
		if (state.owner == ctid) {
			__lck_mtx_owned_panic(lock, thread);
		}
		if (lock->lck_mtx_type != LCK_TYPE_MUTEX ||
		    state.data == LCK_MTX_TAG_DESTROYED) {
			__lck_mtx_invalid_panic(lock);
		}
		profile = (state.data & LCK_MTX_PROFILE);

		/*
		 * Attempt steal
		 *
		 * When the lock state is 0, then no thread can be queued
		 * for adaptive spinning or for the interlock yet.
		 *
		 * As such we can attempt to take the interlock.
		 * (we can't take the mutex directly because we need
		 * the interlock to do turnstile operations on the way out).
		 */
		if ((state.val & ~(uint64_t)LCK_MTX_PROFILE) == 0) {
			if (!os_atomic_cmpxchgv(&lock->lck_mtx.val,
			    state.val, state.val | LCK_MTX_ILOCK,
			    &state.val, acquire)) {
				continue;
			}
			break;
		}

#if CONFIG_DTRACE
		if (profile) {
			LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
		}
#endif /* CONFIG_DTRACE */

		if (mode == LCK_MTX_MODE_SLEEPABLE) {
			spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
		} else {
			spin_start = LCK_MTX_SPIN_SPIN_BEGIN();
		}

		/*
		 * Adaptive spin or interlock
		 *
		 * Evaluate if adaptive spinning should be attempted,
		 * and if yes go to adaptive spin.
		 *
		 * Otherwise (and this includes always-spin mutexes),
		 * go for the interlock.
		 */
		if (mode != LCK_MTX_MODE_SPIN_ALWAYS &&
		    (state.ilocked || state.as_tail || !state.owner ||
		    lck_mtx_ctid_on_core(state.owner))) {
			lck_mtx_lock_adaptive_spin(lock, state);
		} else {
			direct_wait = true;
			lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_DIRECT);
		}

		if (mode == LCK_MTX_MODE_SLEEPABLE) {
			LCK_MTX_ADAPTIVE_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
		} else {
			LCK_MTX_SPIN_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
		}

		/*
		 * Take or sleep
		 *
		 * We now have the interlock. Either the owner
		 * isn't set, and the mutex is ours to claim,
		 * or we must go to sleep.
		 *
		 * If we go to sleep, we need to set LCK_MTX_NEEDS_WAKEUP
		 * to force the current lock owner to call
		 * lck_mtx_unlock_wakeup().
		 */
		state = os_atomic_load(&lock->lck_mtx, relaxed);
		if (state.owner == LCK_MTX_NULL_CTID) {
			break;
		}

		if (mode == LCK_MTX_MODE_SPIN_ALWAYS) {
			__lck_mtx_lock_is_sleepable_panic(lock);
		}

#if CONFIG_DTRACE
		if (profile) {
			LCK_MTX_PROF_WAIT(lock, lock->lck_mtx_grp,
			    direct_wait, &first_miss);
		}
#endif /* CONFIG_DTRACE */
		os_atomic_store(&lock->lck_mtx.data,
		    state.data | LCK_MTX_ILOCK | LCK_MTX_NEEDS_WAKEUP,
		    compiler_acq_rel);
		ts = lck_mtx_lock_wait(lock, thread,
		    ctid_get_thread(state.owner), ts);

		/* returns interlock unlocked and preemption re-enabled */
		lock_disable_preemption_for_thread(thread);
	}

	/*
	 * We can take the lock!
	 *
	 * We only have the interlock and the owner field is 0.
	 *
	 * Perform various turnstile cleanups if needed,
	 * claim the lock, and reenable preemption (if needed).
	 */
	if (ts) {
		lck_mtx_lock_wait_done(lock, ts);
	}
	data = ctid | profile;
	if (lck_mtx_lock_will_need_wakeup(lock, thread)) {
		data |= LCK_MTX_NEEDS_WAKEUP;
	}
	if (mode != LCK_MTX_MODE_SLEEPABLE) {
		data |= LCK_MTX_ILOCK | LCK_MTX_SPIN_MODE;
	}
	os_atomic_store(&lock->lck_mtx.data, data, release);

	if (mode == LCK_MTX_MODE_SLEEPABLE) {
		lock_enable_preemption();
	}

	assert(thread->turnstile != NULL);

	if (ts) {
		turnstile_cleanup();
	}
	LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
	    mode != LCK_MTX_MODE_SLEEPABLE, profile);
}

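/*
 * Routine: lck_mtx_lock_slow
 *
 * Slowpath taken by lck_mtx_lock_fastpath(): performs the preemption
 * check and the DTrace profiling CAS when those are compiled in, and
 * punts to lck_mtx_lock_contended() if the lock still cannot be taken.
 */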
#if LCK_MTX_CHECK_INVARIANTS || CONFIG_DTRACE
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static void
lck_mtx_lock_slow(
	lck_mtx_t              *lock,
	thread_t                thread,
	lck_mtx_state_t         state,
	lck_mtx_mode_t          mode)
{
#pragma unused(state)
#if CONFIG_DTRACE
	lck_mtx_state_t ostate = {
		.data = LCK_MTX_PROFILE,
	};
#endif /* CONFIG_DTRACE */

#if LCK_MTX_CHECK_INVARIANTS
	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
		lck_mtx_check_preemption(lock, thread,
		    (mode == LCK_MTX_MODE_SPIN));
	}
#endif /* LCK_MTX_CHECK_INVARIANTS */
#if CONFIG_DTRACE
	if (state.val == ostate.val) {
		state.data = thread->ctid | LCK_MTX_PROFILE;
		if (mode != LCK_MTX_MODE_SLEEPABLE) {
			state.ilocked   = true;
			state.spin_mode = true;
		}
		os_atomic_cmpxchgv(&lock->lck_mtx.val,
		    ostate.val, state.val, &state.val, acquire);
	}
	if ((state.val & ~ostate.val) == 0) {
		LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp,
		    mode != LCK_MTX_MODE_SLEEPABLE,
		    state.data & LCK_MTX_PROFILE);
		return;
	}
#endif /* CONFIG_DTRACE */
	lck_mtx_lock_contended(lock, thread, mode);
}

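/*
 * Routine: lck_mtx_lock_fastpath
 *
 * Common fastpath for lck_mtx_lock{,_spin,_spin_always}(): a single CAS
 * on the whole 64-bit mutex state (which requires the ILK/AS queues to
 * be empty), falling into lck_mtx_lock_slow() when it fails or when the
 * preemption/DTrace checks require the slowpath.
 */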
static __attribute__((always_inline)) void
lck_mtx_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
{
	thread_t thread = current_thread();
	lck_mtx_state_t state = {
		.data = thread->ctid,
	};
	uint64_t take_slowpath = 0;

	if (mode != LCK_MTX_MODE_SPIN_ALWAYS) {
		take_slowpath |= LCK_MTX_SNIFF_PREEMPTION(thread);
	}
	take_slowpath |= LCK_MTX_SNIFF_DTRACE();

	if (mode != LCK_MTX_MODE_SLEEPABLE) {
		lock_disable_preemption_for_thread(thread);
		state.ilocked   = true;
		state.spin_mode = true;
	}

	/*
	 * Do the CAS on the entire mutex state,
	 * which hence requires the ILK/AS queues
	 * to be empty (which is fairer).
	 */
	lock_cmpxchgv(&lock->lck_mtx.val,
	    0, state.val, &state.val, acquire);

	take_slowpath |= state.val;
	if (__improbable(take_slowpath)) {
		return lck_mtx_lock_slow(lock, thread, state, mode);
	}
}

__mockable void
lck_mtx_lock(lck_mtx_t *lock)
{
	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
}

void
lck_mtx_lock_spin(lck_mtx_t *lock)
{
	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
}

void
lck_mtx_lock_spin_always(lck_mtx_t *lock)
{
	lck_mtx_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
}


#pragma mark lck_mtx_t: lck_mtx_try_lock

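/*
 * Routine: lck_mtx_try_lock_slow_inline
 *
 * Slowpath for lck_mtx_try_lock*(): retries the acquisition CAS when
 * only the DTrace LCK_MTX_PROFILE bit was set, and otherwise reports
 * failure, re-enabling preemption for the spin variants.
 */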
static __attribute__((always_inline)) bool
lck_mtx_try_lock_slow_inline(
	lck_mtx_t              *lock,
	thread_t                thread,
	uint32_t                odata,
	uint32_t                ndata,
	bool                    spin)
{
#pragma unused(lock, thread, odata, ndata)
#if CONFIG_DTRACE
	if (odata == LCK_MTX_PROFILE) {
		os_atomic_cmpxchgv(&lock->lck_mtx.data,
		    odata, ndata | LCK_MTX_PROFILE, &odata, acquire);
	}
	if ((odata & ~LCK_MTX_PROFILE) == 0) {
		LCK_MTX_TRY_ACQUIRED(lock, lock->lck_mtx_grp,
		    spin, odata & LCK_MTX_PROFILE);
		return true;
	}
	if (odata & LCK_MTX_PROFILE) {
		LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &(int){ 0 });
	}
#endif /* CONFIG_DTRACE */

	if (spin) {
		lock_enable_preemption();
	}
	return false;
}

#if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static bool
lck_mtx_try_lock_slow(
	lck_mtx_t              *lock,
	thread_t                thread,
	uint32_t                odata,
	uint32_t                ndata)
{
	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, false);
}

#if CONFIG_DTRACE || LCK_MTX_CHECK_INVARIANTS
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static bool
lck_mtx_try_lock_slow_spin(
	lck_mtx_t              *lock,
	thread_t                thread,
	uint32_t                odata,
	uint32_t                ndata)
{
	return lck_mtx_try_lock_slow_inline(lock, thread, odata, ndata, true);
}

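/*
 * Routine: lck_mtx_try_lock_fastpath
 *
 * Common fastpath for lck_mtx_try_lock{,_spin,_spin_always}(): a CAS on
 * the 32-bit mutex data only, punting to the slow variants on failure.
 */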
static __attribute__((always_inline)) bool
lck_mtx_try_lock_fastpath(lck_mtx_t *lock, lck_mtx_mode_t mode)
{
	thread_t thread = current_thread();
	uint32_t odata, ndata = thread->ctid;
	uint32_t take_slowpath = 0;

#if CONFIG_DTRACE
	take_slowpath |= lck_debug_state.lds_value;
#endif
	if (mode != LCK_MTX_MODE_SLEEPABLE) {
		lock_disable_preemption_for_thread(thread);
		ndata |= LCK_MTX_SPIN_MODE | LCK_MTX_ILOCK;
	}

	/*
	 * Because try_lock is likely to be used for cases
	 * like lock inversion resolution, it tries a bit harder
	 * than lck_mtx_lock() to take the lock, and ignores the
	 * adaptive spin / interlock queues by doing the CAS
	 * on the 32-bit mutex data only.
	 */
	lock_cmpxchgv(&lock->lck_mtx.data, 0, ndata, &odata, acquire);

	take_slowpath |= odata;
	if (__probable(!take_slowpath)) {
		return true;
	}

	if (mode == LCK_MTX_MODE_SPIN_ALWAYS &&
	    (odata & LCK_MTX_CTID_MASK) &&
	    !(odata & LCK_MTX_SPIN_MODE)) {
		__lck_mtx_lock_is_sleepable_panic(lock);
	}

	if (mode == LCK_MTX_MODE_SLEEPABLE) {
		return lck_mtx_try_lock_slow(lock, thread, odata, ndata);
	} else {
		return lck_mtx_try_lock_slow_spin(lock, thread, odata, ndata);
	}
}

boolean_t
lck_mtx_try_lock(lck_mtx_t *lock)
{
	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SLEEPABLE);
}

boolean_t
lck_mtx_try_lock_spin(lck_mtx_t *lock)
{
	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN);
}

boolean_t
lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
{
	return lck_mtx_try_lock_fastpath(lock, LCK_MTX_MODE_SPIN_ALWAYS);
}


#pragma mark lck_mtx_t: lck_mtx_unlock

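/*
 * Routine: lck_mtx_unlock_contended
 *
 * Slowpath of lck_mtx_unlock(): takes the interlock if the lock was not
 * held in spin mode, wakes up a waiter when LCK_MTX_NEEDS_WAKEUP is set,
 * and then releases both the mutex and the interlock.
 */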
static NOINLINE void
lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, uint32_t data)
{
	bool cleanup = false;

#if !CONFIG_DTRACE
	/*
	 * This check is done by lck_mtx_unlock_slow() when it is enabled.
	 */
	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
		__lck_mtx_not_owned_panic(lock, thread);
	}
#endif /* !CONFIG_DTRACE */

	if ((data & LCK_MTX_SPIN_MODE) == 0) {
		lock_disable_preemption_for_thread(thread);
		lck_mtx_ilk_lock_nopreempt(lock, LCK_ILK_MODE_UNLOCK);
	}

	/*
	 * We must re-load the data: we might have taken
	 * the slowpath because another thread had taken
	 * the interlock and set the NEEDS_WAKEUP bit
	 * while we were spinning to get it.
	 */
	data = os_atomic_load(&lock->lck_mtx.data, compiler_acq_rel);
	if (data & LCK_MTX_NEEDS_WAKEUP) {
		lck_mtx_unlock_wakeup(lock, thread);
		cleanup = true;
	}
	lck_mtx_ilk_unlock_v(lock, data & LCK_MTX_PROFILE);

	LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, data & LCK_MTX_PROFILE);

	/*
	 * Do not do any turnstile operations outside of this block.
	 *
	 * lock/unlock is called at an early stage of boot while single
	 * threaded, without turnstiles being available yet.
	 * Even without contention we can come through the slow path
	 * if the mutex is acquired as a spin lock.
	 */
	if (cleanup) {
		turnstile_cleanup();
	}
}

#if CONFIG_DTRACE
__attribute__((noinline))
#else
__attribute__((always_inline))
#endif
static void
lck_mtx_unlock_slow(lck_mtx_t *lock, thread_t thread, uint32_t data)
{
#if CONFIG_DTRACE
	/*
	 * If DTrace is enabled, locks can be profiled,
	 * which causes the fastpath of unlock to fail.
	 */
	if ((data & LCK_MTX_BITS_MASK) == LCK_MTX_PROFILE) {
		os_atomic_cmpxchgv(&lock->lck_mtx.data, data, LCK_MTX_PROFILE,
		    &data, release);
	}
	if (thread->ctid != (data & LCK_MTX_CTID_MASK)) {
		__lck_mtx_not_owned_panic(lock, thread);
	}
	if ((data & (LCK_MTX_BITS_MASK & ~LCK_MTX_PROFILE)) == 0) {
		LCK_MTX_RELEASED(lock, lock->lck_mtx_grp, false);
		return;
	}
#endif /* CONFIG_DTRACE */

	lck_mtx_unlock_contended(lock, thread, data);
}

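/*
 * Routine: lck_mtx_unlock
 *
 * Release a mutex. The fastpath is a single CAS clearing the owner ctid
 * from the data word; any other bit being set sends us to the slowpath.
 */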
__mockable void
lck_mtx_unlock(lck_mtx_t *lock)
{
	thread_t thread = current_thread();
	uint32_t take_slowpath = 0;
	uint32_t data;

	take_slowpath |= LCK_MTX_SNIFF_DTRACE();

	/*
	 * The fast path ignores the ILK/AS queues on purpose;
	 * those really are a "lock" concept, not an unlock one.
	 */
	if (__probable(lock_cmpxchgv(&lock->lck_mtx.data,
	    thread->ctid, 0, &data, release))) {
		if (__probable(!take_slowpath)) {
			return;
		}
	}

	lck_mtx_unlock_slow(lock, thread, data);
}


#pragma mark lck_mtx_t: misc

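/*
 * Routine: lck_mtx_assert
 *
 * Panic unless the current thread's ownership of the mutex matches
 * the LCK_MTX_ASSERT_OWNED / LCK_MTX_ASSERT_NOTOWNED expectation.
 */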
void
lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
{
	lck_mtx_state_t state  = os_atomic_load(&lock->lck_mtx, relaxed);
	thread_t        thread = current_thread();

	if (type == LCK_MTX_ASSERT_OWNED) {
		if (state.owner != thread->ctid) {
			__lck_mtx_not_owned_panic(lock, thread);
		}
	} else if (type == LCK_MTX_ASSERT_NOTOWNED) {
		if (state.owner == thread->ctid) {
			__lck_mtx_owned_panic(lock, thread);
		}
	} else {
		panic("lck_mtx_assert(): invalid arg (%u)", type);
	}
}

/*
 * Routine: lck_mtx_convert_spin
 *
 * Convert a mutex held for spin into a held full mutex
 */
void
lck_mtx_convert_spin(lck_mtx_t *lock)
{
	lck_mtx_state_t state  = os_atomic_load(&lock->lck_mtx, relaxed);
	thread_t        thread = current_thread();
	uint32_t        data   = thread->ctid;

	if (state.owner != data) {
		__lck_mtx_not_owned_panic(lock, thread);
	}

	if (state.spin_mode) {
		/*
		 * Note: we can acquire the lock in spin mode
		 * _and_ be the inheritor if we waited.
		 *
		 * We must only clear ilocked and spin_mode,
		 * but preserve owner and needs_wakeup.
		 */
		state.ilocked   = false;
		state.spin_mode = false;
		lck_mtx_ilk_unlock_v(lock, state.data);
		turnstile_cleanup();
	}
}

/*
 * Routine: kdp_lck_mtx_lock_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 */
boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
{
	lck_mtx_state_t state = os_atomic_load(&lck->lck_mtx, relaxed);

	if (not_in_kdp) {
		panic("panic: spinlock acquired check done outside of kernel debugger");
	}
	if (state.data == LCK_MTX_TAG_DESTROYED) {
		return false;
	}
	return state.owner || state.ilocked;
}

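/*
 * Routine: kdp_lck_mtx_find_owner
 * NOT SAFE: To be used only by the kernel debugger; reports the mutex
 * and its current owner in the thread wait information.
 */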
void
kdp_lck_mtx_find_owner(
	struct waitq           *waitq __unused,
	event64_t               event,
	thread_waitinfo_t      *waitinfo)
{
	lck_mtx_t *mutex = LCK_EVENT_TO_MUTEX(event);
	lck_mtx_state_t state = os_atomic_load(&mutex->lck_mtx, relaxed);

	assert3u(state.data, !=, LCK_MTX_TAG_DESTROYED);
	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
	waitinfo->owner   = thread_tid(ctid_get_thread(state.owner));
}

#endif /* !LCK_MTX_USE_ARCH */

/*
 * Routine: mutex_pause
 *
 * Called by former callers of simple_lock_pause().
 */
#define MAX_COLLISION_COUNTS    32
#define MAX_COLLISION           8

unsigned int max_collision_count[MAX_COLLISION_COUNTS];

uint32_t collision_backoffs[MAX_COLLISION] = {
	10, 50, 100, 200, 400, 600, 800, 1000
};


void
mutex_pause(uint32_t collisions)
{
	wait_result_t wait_result;
	uint32_t      back_off;

	if (collisions >= MAX_COLLISION_COUNTS) {
		collisions = MAX_COLLISION_COUNTS - 1;
	}
	max_collision_count[collisions]++;

	if (collisions >= MAX_COLLISION) {
		collisions = MAX_COLLISION - 1;
	}
	back_off = collision_backoffs[collisions];

	wait_result = assert_wait_timeout((event_t)mutex_pause, THREAD_UNINT, back_off, NSEC_PER_USEC);
	assert(wait_result == THREAD_WAITING);

	wait_result = thread_block(THREAD_CONTINUE_NULL);
	assert(wait_result == THREAD_TIMED_OUT);
}


unsigned int mutex_yield_wait    = 0;
unsigned int mutex_yield_no_wait = 0;

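/*
 * Routine: lck_mtx_yield
 *
 * If the mutex has waiters, drop it, pause, and re-acquire it so that a
 * waiter gets a chance to run; returns whether waiters were present.
 */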
boolean_t
lck_mtx_yield(
	lck_mtx_t *lck)
{
	bool has_waiters = LCK_MTX_HAS_WAITERS(lck);

#if DEBUG
	lck_mtx_assert(lck, LCK_MTX_ASSERT_OWNED);
#endif /* DEBUG */

	if (!has_waiters) {
		mutex_yield_no_wait++;
	} else {
		mutex_yield_wait++;
		lck_mtx_unlock(lck);
		mutex_pause(0);
		lck_mtx_lock(lck);
	}
	return has_waiters;
}