1 /*
2 * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System Copyright (c) 1991,1990,1989,1988,1987 Carnegie
33 * Mellon University All Rights Reserved.
34 *
35 * Permission to use, copy, modify and distribute this software and its
36 * documentation is hereby granted, provided that both the copyright notice
37 * and this permission notice appear in all copies of the software,
38 * derivative works or modified versions, and any portions thereof, and that
39 * both notices appear in supporting documentation.
40 *
41 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
42 * CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES
43 * WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
44 *
45 * Carnegie Mellon requests users of this software to return to
46 *
47 * Software Distribution Coordinator or [email protected]
48 * School of Computer Science Carnegie Mellon University Pittsburgh PA
49 * 15213-3890
50 *
51 * any improvements or extensions that they make and grant Carnegie Mellon the
52 * rights to redistribute these changes.
53 */
54 /*
55 * File: kern/lock.c
56 * Author: Avadis Tevanian, Jr., Michael Wayne Young
57 * Date: 1985
58 *
59 * Locking primitives implementation
60 */
61
62 #define LOCK_PRIVATE 1
63
64 #include <mach_ldebug.h>
65
66 #include <mach/machine/sdt.h>
67
68 #include <kern/zalloc.h>
69 #include <kern/lock_stat.h>
70 #include <kern/locks.h>
71 #include <kern/misc_protos.h>
72 #include <kern/thread.h>
73 #include <kern/processor.h>
74 #include <kern/sched_hygiene.h>
75 #include <kern/sched_prim.h>
76 #include <kern/debug.h>
77 #include <kern/kcdata.h>
78 #include <kern/percpu.h>
79 #include <string.h>
80 #include <arm/cpu_internal.h>
81 #include <os/hash.h>
82 #include <arm/cpu_data.h>
83
84 #include <arm/cpu_data_internal.h>
85 #include <arm/proc_reg.h>
86 #include <arm/smp.h>
87 #include <machine/atomic.h>
88 #include <machine/machine_cpu.h>
89
90 #include <pexpert/pexpert.h>
91
92 #include <sys/kdebug.h>
93
94 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
95
96 // Panic in tests that check lock usage correctness
97 // These are undesirable when already in a panic or when a debugger is running.
98 #define LOCK_CORRECTNESS_PANIC() (kernel_debugger_entry_count == 0)
99
100 #define ADAPTIVE_SPIN_ENABLE 0x1
101
102 int lck_mtx_adaptive_spin_mode = ADAPTIVE_SPIN_ENABLE;
103
104 #define SPINWAIT_OWNER_CHECK_COUNT 4
105
106 typedef enum {
107 SPINWAIT_ACQUIRED, /* Got the lock. */
108 SPINWAIT_INTERLOCK, /* Got the interlock, no owner, but caller must finish acquiring the lock. */
109 SPINWAIT_DID_SPIN_HIGH_THR, /* Got the interlock, spun, but failed to get the lock. */
110 SPINWAIT_DID_SPIN_OWNER_NOT_CORE, /* Got the interlock, spun, but failed to get the lock. */
111 SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION, /* Got the interlock, spun, but failed to get the lock. */
112 SPINWAIT_DID_SPIN_SLIDING_THR,/* Got the interlock, spun, but failed to get the lock. */
113 SPINWAIT_DID_NOT_SPIN, /* Got the interlock, did not spin. */
114 } spinwait_result_t;
115
116 #if CONFIG_DTRACE
117 extern machine_timeout32_t dtrace_spin_threshold;
118 #endif
119
120 /* Forwards */
121
122 extern unsigned int not_in_kdp;
123
124 /*
125 * We often want to know the addresses of the callers
126 * of the various lock routines. However, this information
127 * is only used for debugging and statistics.
128 */
129 typedef void *pc_t;
130 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
131 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
132
133 #ifdef lint
134 /*
135 * Eliminate lint complaints about unused local pc variables.
136 */
137 #define OBTAIN_PC(pc, l) ++pc
138 #else /* lint */
139 #define OBTAIN_PC(pc, l)
140 #endif /* lint */
141
142
143 /*
144 * Portable lock package implementation of usimple_locks.
145 */
146
147 /*
148 * Owner thread pointer when lock held in spin mode
149 */
150 #define LCK_MTX_SPIN_TAG 0xfffffff0
151
152
153 #define interlock_lock(lock) hw_lock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
154 #define interlock_try(lock) hw_lock_bit_try((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
155 #define interlock_unlock(lock) hw_unlock_bit ((hw_lock_bit_t*)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT)
156 #define load_memory_barrier() os_atomic_thread_fence(acquire)
157
158 // Enforce program order of loads and stores.
159 #define ordered_load(target) \
160 os_atomic_load(target, compiler_acq_rel)
161 #define ordered_store(target, value) \
162 os_atomic_store(target, value, compiler_acq_rel)
163
164 #define ordered_load_mtx(lock) ordered_load(&(lock)->lck_mtx_data)
165 #define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, (value))
166 #define ordered_load_hw(lock) ordered_load(&(lock)->lock_data)
167 #define ordered_store_hw(lock, value) ordered_store(&(lock)->lock_data, (value))
168 #define ordered_load_bit(lock) ordered_load((lock))
169 #define ordered_store_bit(lock, value) ordered_store((lock), (value))
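/*
 * For readers of the mutex routines below: lck_mtx_data packs the entire
 * mutex state into a single word. The upper bits hold the owner thread
 * pointer (or LCK_MTX_SPIN_TAG when the lock is held in spin mode) and the
 * low bits hold ARM_LCK_WAITERS and LCK_ILOCK. An illustrative sketch of
 * the common encodings, using the accessor macros from the lock headers:
 *
 *   0                                              unowned, uncontended
 *   LCK_MTX_THREAD_TO_STATE(t)                     owned by thread t
 *   LCK_MTX_THREAD_TO_STATE(t) | ARM_LCK_WAITERS   owned by t, waiters blocked
 *   LCK_MTX_SPIN_TAG | LCK_ILOCK                   held as a spin lock
 *   ... | LCK_ILOCK                                interlock held for a short
 *                                                  critical section
 */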
170
171
172 // Prevent the compiler from reordering memory operations around this
173 #define compiler_memory_fence() __asm__ volatile ("" ::: "memory")
174
175 MACHINE_TIMEOUT32(lock_panic_timeout, "lock-panic",
176 0xc00000 /* 12.5 m ticks ~= 524ms with 24MHz OSC */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);
177
178 #define NOINLINE __attribute__((noinline))
179
180
181 #if __arm__
182 #define interrupts_disabled(mask) (mask & PSR_INTMASK)
183 #else
184 #define interrupts_disabled(mask) (mask & DAIF_IRQF)
185 #endif
186
187
188 #if __arm__
189 #define enable_fiq() __asm__ volatile ("cpsie f" ::: "memory");
190 #define enable_interrupts() __asm__ volatile ("cpsie if" ::: "memory");
191 #endif
192
193 KALLOC_TYPE_DEFINE(KT_LCK_SPIN, lck_spin_t, KT_PRIV_ACCT);
194
195 KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);
196
197 KALLOC_TYPE_DEFINE(KT_LCK_MTX_EXT, lck_mtx_ext_t, KT_PRIV_ACCT);
198
199 #pragma GCC visibility push(hidden)
200 /*
201 * atomic exchange API is a low level abstraction of the operations
202 * to atomically read, modify, and write a pointer. This abstraction works
203 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
204 * well as the ARM exclusive instructions.
205 *
206 * atomic_exchange_begin() - begin exchange and retrieve current value
207 * atomic_exchange_complete() - conclude an exchange
208 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
209 */
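/*
 * An illustrative sketch (not part of this file) of how the
 * begin/complete/abort API is typically used to build a retry loop; the
 * helper name example_atomic_or32 is a placeholder:
 *
 *   static uint32_t
 *   example_atomic_or32(uint32_t *target, uint32_t bits)
 *   {
 *           uint32_t        value, prev;
 *
 *           for (;;) {
 *                   value = atomic_exchange_begin32(target, &prev, memory_order_acquire);
 *                   value |= bits;
 *                   if (atomic_exchange_complete32(target, prev, value, memory_order_acquire)) {
 *                           return prev;
 *                   }
 *           }
 *   }
 *
 * A caller that decides not to store after atomic_exchange_begin32() must
 * call atomic_exchange_abort() to release the exclusive monitor, as
 * atomic_test_and_set32() below does on its failure path.
 */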
210 uint32_t
211 load_exclusive32(uint32_t *target, enum memory_order ord)
212 {
213 uint32_t value;
214
215 #if __arm__
216 if (_os_atomic_mo_has_release(ord)) {
217 // Pre-load release barrier
218 atomic_thread_fence(memory_order_release);
219 }
220 value = __builtin_arm_ldrex(target);
221 #else
222 if (_os_atomic_mo_has_acquire(ord)) {
223 value = __builtin_arm_ldaex(target); // ldaxr
224 } else {
225 value = __builtin_arm_ldrex(target); // ldxr
226 }
227 #endif // __arm__
228 return value;
229 }
230
231 boolean_t
232 store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord)
233 {
234 boolean_t err;
235
236 #if __arm__
237 err = __builtin_arm_strex(value, target);
238 if (_os_atomic_mo_has_acquire(ord)) {
239 // Post-store acquire barrier
240 atomic_thread_fence(memory_order_acquire);
241 }
242 #else
243 if (_os_atomic_mo_has_release(ord)) {
244 err = __builtin_arm_stlex(value, target); // stlxr
245 } else {
246 err = __builtin_arm_strex(value, target); // stxr
247 }
248 #endif // __arm__
249 return !err;
250 }
251
252 uint32_t
253 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
254 {
255 uint32_t val;
256
257 #if __ARM_ATOMICS_8_1
258 ord = memory_order_relaxed;
259 #endif
260 val = load_exclusive32(target, ord);
261 *previous = val;
262 return val;
263 }
264
265 boolean_t
266 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
267 {
268 #if __ARM_ATOMICS_8_1
269 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
270 #else
271 (void)previous; // Previous not needed, monitor is held
272 return store_exclusive32(target, newval, ord);
273 #endif
274 }
275
276 void
277 atomic_exchange_abort(void)
278 {
279 os_atomic_clear_exclusive();
280 }
281
282 boolean_t
283 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
284 {
285 uint32_t value, prev;
286
287 for (;;) {
288 value = atomic_exchange_begin32(target, &prev, ord);
289 if (value & test_mask) {
290 if (wait) {
291 wait_for_event(); // Wait with monitor held
292 } else {
293 atomic_exchange_abort(); // Clear exclusive monitor
294 }
295 return FALSE;
296 }
297 value |= set_mask;
298 if (atomic_exchange_complete32(target, prev, value, ord)) {
299 return TRUE;
300 }
301 }
302 }
303
304 #pragma GCC visibility pop
305
306 #if SCHED_PREEMPTION_DISABLE_DEBUG
307
308 uint64_t PERCPU_DATA(preemption_disable_max_mt);
309
310 MACHINE_TIMEOUT_WRITEABLE(sched_preemption_disable_threshold_mt, "sched-preemption", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, kprintf_spam_mt_pred);
311
312 TUNABLE_DT_WRITEABLE(sched_hygiene_mode_t, sched_preemption_disable_debug_mode,
313 "machine-timeouts",
314 "sched-preemption-disable-mode", /* DT property names have to be 31 chars max */
315 "sched_preemption_disable_debug_mode",
316 SCHED_HYGIENE_MODE_OFF,
317 TUNABLE_DT_CHECK_CHOSEN);
318
319 static uint32_t const sched_preemption_disable_debug_dbgid = MACHDBG_CODE(DBG_MACH_SCHED, MACH_PREEMPTION_EXPIRED) | DBG_FUNC_NONE;
320
321 NOINLINE void
322 _prepare_preemption_disable_measurement(thread_t thread)
323 {
324 if (thread->machine.inthandler_timestamp == 0) {
325 /*
326 * Only prepare a measurement if not currently in an interrupt
327 * handler.
328 *
329 * We are only interested in the net duration of disabled
330 * preemption, that is: The time in which preemption was
331 * disabled, minus the intervals in which any (likely
332 * unrelated) interrupts were handled.
333 * ml_adjust_preemption_disable_time() will remove those
334 * intervals, however we also do not even start measuring
335 * preemption disablement if we are already within handling of
336 * an interrupt when preemption was disabled (the resulting
337 * net time would be 0).
338 *
339 * Interrupt handling duration is handled separately, and any
340 * long intervals of preemption disablement are counted
341 * towards that.
342 */
343 thread->machine.preemption_disable_adj_mt = ml_get_speculative_timebase();
344 }
345 }
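/*
 * A worked example of the "net duration" described above, with made-up
 * numbers: preemption is disabled at timebase tick 1000, an unrelated
 * interrupt is handled from tick 1200 to tick 1500, and preemption is
 * re-enabled at tick 2000. The raw window is 2000 - 1000 = 1000 ticks, but
 * the value of interest is the net 1000 - 300 = 700 ticks, which is what
 * adjusting preemption_disable_adj_mt during interrupt handling achieves.
 * If preemption had been disabled while already inside the interrupt
 * handler, the net time would be ~0, so no measurement is started at all.
 */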
346
347 NOINLINE void
348 _collect_preemption_disable_measurement(thread_t thread)
349 {
350 bool istate = ml_set_interrupts_enabled(false);
351 /*
352 * Collect start time and current time with interrupts disabled.
353 * Otherwise an interrupt coming in after grabbing the timestamp
354 * could spuriously inflate the measurement, because it will
355 * adjust preemption_disable_adj_mt only after we already grabbed
356 * it.
357 *
358 * (Even worse if we collected the current time first: Then a
359 * subsequent interrupt could adjust preemption_disable_adj_mt to
360 * make the duration go negative after subtracting the already
361 * grabbed time. With interrupts disabled we don't care much about
362 * the order.)
363 */
364
365 uint64_t const mt = thread->machine.preemption_disable_adj_mt;
366 uint64_t const now = ml_get_speculative_timebase();
367
368 os_compiler_barrier(acq_rel);
369
370 ml_set_interrupts_enabled(istate);
371
372 int64_t const duration = now - mt;
373
374
375 uint64_t * const max_duration = PERCPU_GET(preemption_disable_max_mt);
376
377 if (__improbable(duration > *max_duration)) {
378 *max_duration = duration;
379 }
380
381 uint64_t const threshold = os_atomic_load(&sched_preemption_disable_threshold_mt, relaxed);
382 if (__improbable(threshold > 0 && duration >= threshold)) {
383 if (sched_preemption_disable_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
384 panic("preemption disable timeout exceeded: %llu >= %llu timebase ticks", duration, threshold);
385 }
386
387 DTRACE_SCHED1(mach_preemption_expired, uint64_t, duration);
388 if (__improbable(kdebug_debugid_enabled(sched_preemption_disable_debug_dbgid))) {
389 KDBG(sched_preemption_disable_debug_dbgid, duration);
390 }
391 }
392
393 thread->machine.preemption_disable_adj_mt = 0;
394 }
395
396 /*
397 * Skip predicate for sched_preemption_disable, which would trigger
398 * spuriously when kprintf spam is enabled.
399 */
400 bool
401 kprintf_spam_mt_pred(struct machine_timeout_spec const __unused *spec)
402 {
403 bool const kprintf_spam_enabled = !(disable_kprintf_output || disable_serial_output);
404 return kprintf_spam_enabled;
405 }
406
407 #endif /* SCHED_PREEMPTION_DISABLE_DEBUG */
408
409 /*
410 * To help _disable_preemption() inline everywhere with LTO,
411 * we keep these nice non inlineable functions as the panic()
412 * codegen setup is quite large and for weird reasons causes a frame.
413 */
414 __abortlike
415 static void
416 _disable_preemption_overflow(void)
417 {
418 panic("Preemption count overflow");
419 }
420
421 void
422 _disable_preemption(void)
423 {
424 thread_t thread = current_thread();
425 unsigned int count = thread->machine.preemption_count;
426
427 if (__improbable(++count == 0)) {
428 _disable_preemption_overflow();
429 }
430
431 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
432
433 #if SCHED_PREEMPTION_DISABLE_DEBUG
434
435 /*
436 * Note that this is not the only place preemption gets disabled,
437 * it also gets modified on ISR and PPL entry/exit. Both of those
438 * events will be treated specially however, and
439 * increment/decrement being paired around their entry/exit means
440 * that collection here is not desynced otherwise.
441 */
442
443 if (count == 1 && sched_preemption_disable_debug_mode) {
444 _prepare_preemption_disable_measurement(thread);
445 }
446 #endif /* SCHED_PREEMPTION_DISABLE_DEBUG */
447 }
448
449 /*
450 * This variant of _disable_preemption() allows disabling preemption
451 * without taking measurements (and later potentially triggering
452 * actions on those).
453 *
454 * We do this through a separate variant because we do not want to
455 * disturb inlinability of _disable_preemption(). However, in order to
456 * also avoid code duplication, instead of repeating common code we
457 * simply call _disable_preemption() and explicitly abandon any taken
458 * measurement.
459 */
460 void
461 _disable_preemption_without_measurements(void)
462 {
463 _disable_preemption();
464
465 #if SCHED_PREEMPTION_DISABLE_DEBUG
466 /*
467 * Abandon a potential preemption disable measurement. Useful for
468 * example for the idle thread, which would just spuriously
469 * trigger the threshold while actually idling, which we don't
470 * care about.
471 */
472 thread_t t = current_thread();
473 if (t->machine.preemption_disable_adj_mt != 0) {
474 t->machine.preemption_disable_adj_mt = 0;
475 }
476 #endif /* SCHED_PREEMPTION_DISABLE_DEBUG */
477 }
478
479 /*
480 * This function checks whether an AST_URGENT has been pended.
481 *
482 * It is called once the preemption has been reenabled, which means the thread
483 * may have been preempted right before this was called, and when this function
484 * actually performs the check, we've changed CPU.
485 *
486 * This race is however benign: the point of AST_URGENT is to trigger a context
487 * switch, so if one happened, there's nothing left to check for, and AST_URGENT
488 * was cleared in the process.
489 *
490 * It follows that this check cannot have false negatives, which allows us
491 * to avoid fiddling with interrupt state for the vast majority of cases
492 * when the check will actually be negative.
493 */
494 static NOINLINE void
495 kernel_preempt_check(thread_t thread)
496 {
497 cpu_data_t *cpu_data_ptr;
498 long state;
499
500 #if __arm__
501 #define INTERRUPT_MASK PSR_IRQF
502 #else // __arm__
503 #define INTERRUPT_MASK DAIF_IRQF
504 #endif // __arm__
505
506 /*
507 * This check is racy and could load from another CPU's pending_ast mask,
508 * but as described above, this can't have false negatives.
509 */
510 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
511 if (__probable((cpu_data_ptr->cpu_pending_ast & AST_URGENT) == 0)) {
512 return;
513 }
514
515 /* If interrupts are masked, we can't take an AST here */
516 state = get_interrupts();
517 if ((state & INTERRUPT_MASK) == 0) {
518 disable_interrupts_noread(); // Disable interrupts
519
520 /*
521 * Reload cpu_data_ptr: a context switch would cause it to change.
522 * Now that interrupts are disabled, this will debounce false positives.
523 */
524 cpu_data_ptr = os_atomic_load(&thread->machine.CpuDatap, compiler_acq_rel);
525 if (cpu_data_ptr->cpu_pending_ast & AST_URGENT) {
526 #if __arm__
527 #if __ARM_USER_PROTECT__
528 uintptr_t up = arm_user_protect_begin(thread);
529 #endif // __ARM_USER_PROTECT__
530 enable_fiq();
531 #endif // __arm__
532 ast_taken_kernel(); // Handle urgent AST
533 #if __arm__
534 #if __ARM_USER_PROTECT__
535 arm_user_protect_end(thread, up, TRUE);
536 #endif // __ARM_USER_PROTECT__
537 enable_interrupts();
538 return; // Return early on arm only due to FIQ enabling
539 #endif // __arm__
540 }
541 restore_interrupts(state); // Enable interrupts
542 }
543 }
544
545 /*
546 * To help _enable_preemption() inline everywhere with LTO,
547 * we keep these nice non inlineable functions as the panic()
548 * codegen setup is quite large and for weird reasons causes a frame.
549 */
550 __abortlike
551 static void
552 _enable_preemption_underflow(void)
553 {
554 panic("Preemption count underflow");
555 }
556
557 void
558 _enable_preemption(void)
559 {
560 thread_t thread = current_thread();
561 unsigned int count = thread->machine.preemption_count;
562
563 if (__improbable(count == 0)) {
564 _enable_preemption_underflow();
565 }
566 count -= 1;
567
568 #if SCHED_PREEMPTION_DISABLE_DEBUG
569 if (count == 0 && thread->machine.preemption_disable_adj_mt != 0) {
570 _collect_preemption_disable_measurement(thread);
571 }
572 #endif /* SCHED_PREEMPTION_DISABLE_DEBUG */
573
574 os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
575 if (count == 0) {
576 kernel_preempt_check(thread);
577 }
578
579 os_compiler_barrier();
580 }
581
582 int
583 get_preemption_level(void)
584 {
585 return current_thread()->machine.preemption_count;
586 }
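/*
 * An illustrative sketch (not part of this file) of how the count-based
 * disable/enable pairs above nest; only the outermost enable drops the
 * count to zero and runs kernel_preempt_check(). The function name is a
 * placeholder:
 *
 *   static void
 *   example_nested_critical_section(void)
 *   {
 *           disable_preemption();   // count: n -> n + 1
 *           disable_preemption();   // count: n + 1 -> n + 2, nested
 *           // ... per-CPU work that must not migrate ...
 *           enable_preemption();    // count: n + 2 -> n + 1, no AST check yet
 *           enable_preemption();    // count: n + 1 -> n, AST check once n == 0
 *   }
 */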
587
588 /*
589 * Routine: lck_spin_alloc_init
590 */
591 lck_spin_t *
592 lck_spin_alloc_init(
593 lck_grp_t * grp,
594 lck_attr_t * attr)
595 {
596 lck_spin_t *lck;
597
598 lck = zalloc(KT_LCK_SPIN);
599 lck_spin_init(lck, grp, attr);
600 return lck;
601 }
602
603 /*
604 * Routine: lck_spin_free
605 */
606 void
607 lck_spin_free(
608 lck_spin_t * lck,
609 lck_grp_t * grp)
610 {
611 lck_spin_destroy(lck, grp);
612 zfree(KT_LCK_SPIN, lck);
613 }
614
615 /*
616 * Routine: lck_spin_init
617 */
618 void
619 lck_spin_init(
620 lck_spin_t * lck,
621 lck_grp_t * grp,
622 __unused lck_attr_t * attr)
623 {
624 lck->type = LCK_SPIN_TYPE;
625 hw_lock_init(&lck->hwlock);
626 if (grp) {
627 lck_grp_reference(grp);
628 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
629 }
630 }
631
632 /*
633 * arm_usimple_lock is a lck_spin_t without a group or attributes
634 */
635 MARK_AS_HIBERNATE_TEXT void inline
636 arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value)
637 {
638 lck->type = LCK_SPIN_TYPE;
639 hw_lock_init(&lck->hwlock);
640 }
641
642
643 /*
644 * Routine: lck_spin_lock
645 */
646 void
647 lck_spin_lock(lck_spin_t *lock)
648 {
649 #if DEVELOPMENT || DEBUG
650 if (lock->type != LCK_SPIN_TYPE) {
651 panic("Invalid spinlock %p", lock);
652 }
653 #endif // DEVELOPMENT || DEBUG
654 hw_lock_lock(&lock->hwlock, LCK_GRP_NULL);
655 }
656
657 void
658 lck_spin_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
659 {
660 #pragma unused(grp)
661 #if DEVELOPMENT || DEBUG
662 if (lock->type != LCK_SPIN_TYPE) {
663 panic("Invalid spinlock %p", lock);
664 }
665 #endif // DEVELOPMENT || DEBUG
666 hw_lock_lock(&lock->hwlock, grp);
667 }
668
669 /*
670 * Routine: lck_spin_lock_nopreempt
671 */
672 void
673 lck_spin_lock_nopreempt(lck_spin_t *lock)
674 {
675 #if DEVELOPMENT || DEBUG
676 if (lock->type != LCK_SPIN_TYPE) {
677 panic("Invalid spinlock %p", lock);
678 }
679 #endif // DEVELOPMENT || DEBUG
680 hw_lock_lock_nopreempt(&lock->hwlock, LCK_GRP_NULL);
681 }
682
683 void
684 lck_spin_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
685 {
686 #pragma unused(grp)
687 #if DEVELOPMENT || DEBUG
688 if (lock->type != LCK_SPIN_TYPE) {
689 panic("Invalid spinlock %p", lock);
690 }
691 #endif // DEVELOPMENT || DEBUG
692 hw_lock_lock_nopreempt(&lock->hwlock, grp);
693 }
694
695 /*
696 * Routine: lck_spin_try_lock
697 */
698 int
699 lck_spin_try_lock(lck_spin_t *lock)
700 {
701 return hw_lock_try(&lock->hwlock, LCK_GRP_NULL);
702 }
703
704 int
705 lck_spin_try_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
706 {
707 #pragma unused(grp)
708 return hw_lock_try(&lock->hwlock, grp);
709 }
710
711 /*
712 * Routine: lck_spin_try_lock_nopreempt
713 */
714 int
715 lck_spin_try_lock_nopreempt(lck_spin_t *lock)
716 {
717 return hw_lock_try_nopreempt(&lock->hwlock, LCK_GRP_NULL);
718 }
719
720 int
721 lck_spin_try_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
722 {
723 #pragma unused(grp)
724 return hw_lock_try_nopreempt(&lock->hwlock, grp);
725 }
726
727 /*
728 * Routine: lck_spin_unlock
729 */
730 void
731 lck_spin_unlock(lck_spin_t *lock)
732 {
733 #if DEVELOPMENT || DEBUG
734 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
735 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
736 }
737 if (lock->type != LCK_SPIN_TYPE) {
738 panic("Invalid spinlock type %p", lock);
739 }
740 #endif // DEVELOPMENT || DEBUG
741 hw_lock_unlock(&lock->hwlock);
742 }
743
744 /*
745 * Routine: lck_spin_unlock_nopreempt
746 */
747 void
748 lck_spin_unlock_nopreempt(lck_spin_t *lock)
749 {
750 #if DEVELOPMENT || DEBUG
751 if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
752 panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
753 }
754 if (lock->type != LCK_SPIN_TYPE) {
755 panic("Invalid spinlock type %p", lock);
756 }
757 #endif // DEVELOPMENT || DEBUG
758 hw_lock_unlock_nopreempt(&lock->hwlock);
759 }
760
761 /*
762 * Routine: lck_spin_destroy
763 */
764 void
765 lck_spin_destroy(
766 lck_spin_t * lck,
767 lck_grp_t * grp)
768 {
769 if (lck->lck_spin_data == LCK_SPIN_TAG_DESTROYED) {
770 return;
771 }
772 lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED;
773 if (grp) {
774 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
775 lck_grp_deallocate(grp);
776 }
777 }
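/*
 * An illustrative sketch (not part of this file) of the dynamically
 * allocated spin lock lifecycle implemented by the routines above,
 * assuming a lock group example_grp declared elsewhere (for instance with
 * LCK_GRP_DECLARE):
 *
 *   lck_spin_t *lock = lck_spin_alloc_init(&example_grp, LCK_ATTR_NULL);
 *
 *   lck_spin_lock(lock);            // returns with preemption disabled
 *   // ... short, non-blocking critical section ...
 *   lck_spin_unlock(lock);          // re-enables preemption
 *
 *   lck_spin_free(lock, &example_grp);      // destroy and return to KT_LCK_SPIN
 */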
778
779 /*
780 * Routine: kdp_lck_spin_is_acquired
781 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
782 */
783 boolean_t
784 kdp_lck_spin_is_acquired(lck_spin_t *lck)
785 {
786 if (not_in_kdp) {
787 panic("panic: spinlock acquired check done outside of kernel debugger");
788 }
789 return ((lck->lck_spin_data & ~LCK_SPIN_TAG_DESTROYED) != 0) ? TRUE:FALSE;
790 }
791
792 /*
793 * Initialize a usimple_lock.
794 *
795 * No change in preemption state.
796 */
797 void
798 usimple_lock_init(
799 usimple_lock_t l,
800 unsigned short tag)
801 {
802 simple_lock_init((simple_lock_t) l, tag);
803 }
804
805
806 /*
807 * Acquire a usimple_lock.
808 *
809 * Returns with preemption disabled. Note
810 * that the hw_lock routines are responsible for
811 * maintaining preemption state.
812 */
813 void
814 (usimple_lock)(
815 usimple_lock_t l
816 LCK_GRP_ARG(lck_grp_t *grp))
817 {
818 simple_lock((simple_lock_t) l, LCK_GRP_PROBEARG(grp));
819 }
820
821
822 extern void sync(void);
823
824 /*
825 * Release a usimple_lock.
826 *
827 * Returns with preemption enabled. Note
828 * that the hw_lock routines are responsible for
829 * maintaining preemption state.
830 */
831 void
832 (usimple_unlock)(
833 usimple_lock_t l)
834 {
835 simple_unlock((simple_lock_t)l);
836 }
837
838
839 /*
840 * Conditionally acquire a usimple_lock.
841 *
842 * On success, returns with preemption disabled.
843 * On failure, returns with preemption in the same state
844 * as when first invoked. Note that the hw_lock routines
845 * are responsible for maintaining preemption state.
846 *
847 * XXX No stats are gathered on a miss; I preserved this
848 * behavior from the original assembly-language code, but
849 * doesn't it make sense to log misses? XXX
850 */
851 unsigned
852 int
853 (usimple_lock_try)(
854 usimple_lock_t l
855 LCK_GRP_ARG(lck_grp_t *grp))
856 {
857 return simple_lock_try((simple_lock_t) l, grp);
858 }
859
860 /*
861 * The C portion of the mutex package. These routines are only invoked
862 * if the optimized assembler routines can't do the work.
863 */
864
865 /*
866 * Forward declaration
867 */
868
869 void
870 lck_mtx_ext_init(
871 lck_mtx_ext_t * lck,
872 lck_grp_t * grp,
873 lck_attr_t * attr);
874
875 /*
876 * Routine: lck_mtx_alloc_init
877 */
878 lck_mtx_t *
879 lck_mtx_alloc_init(
880 lck_grp_t * grp,
881 lck_attr_t * attr)
882 {
883 lck_mtx_t *lck;
884
885 lck = zalloc(KT_LCK_MTX);
886 lck_mtx_init(lck, grp, attr);
887 return lck;
888 }
889
890 /*
891 * Routine: lck_mtx_free
892 */
893 void
894 lck_mtx_free(
895 lck_mtx_t * lck,
896 lck_grp_t * grp)
897 {
898 lck_mtx_destroy(lck, grp);
899 zfree(KT_LCK_MTX, lck);
900 }
901
902 /*
903 * Routine: lck_mtx_init
904 */
905 void
906 lck_mtx_init(
907 lck_mtx_t * lck,
908 lck_grp_t * grp,
909 lck_attr_t * attr)
910 {
911 #ifdef BER_XXX
912 lck_mtx_ext_t *lck_ext;
913 #endif
914 lck_attr_t *lck_attr;
915
916 if (attr != LCK_ATTR_NULL) {
917 lck_attr = attr;
918 } else {
919 lck_attr = &LockDefaultLckAttr;
920 }
921
922 #ifdef BER_XXX
923 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
924 lck_ext = zalloc(KT_LCK_MTX_EXT);
925 lck_mtx_ext_init(lck_ext, grp, lck_attr);
926 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
927 lck->lck_mtx_ptr = lck_ext;
928 lck->lck_mtx_type = LCK_MTX_TYPE;
929 } else
930 #endif
931 {
932 *lck = (lck_mtx_t){
933 .lck_mtx_type = LCK_MTX_TYPE,
934 };
935 }
936 lck_grp_reference(grp);
937 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
938 }
939
940 /*
941 * Routine: lck_mtx_init_ext
942 */
943 void
944 lck_mtx_init_ext(
945 lck_mtx_t * lck,
946 lck_mtx_ext_t * lck_ext __unused,
947 lck_grp_t * grp,
948 lck_attr_t * attr)
949 {
950 lck_attr_t *lck_attr;
951
952 if (attr != LCK_ATTR_NULL) {
953 lck_attr = attr;
954 } else {
955 lck_attr = &LockDefaultLckAttr;
956 }
957
958 #if LOCKS_INDIRECT_ALLOW
959 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
960 lck_mtx_ext_init(lck_ext, grp, lck_attr);
961 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
962 lck->lck_mtx_ptr = lck_ext;
963 lck->lck_mtx_type = LCK_MTX_TYPE;
964 } else
965 #endif /* LOCKS_INDIRECT_ALLOW */
966 {
967 lck->lck_mtx_waiters = 0;
968 lck->lck_mtx_type = LCK_MTX_TYPE;
969 ordered_store_mtx(lck, 0);
970 }
971 lck_grp_reference(grp);
972 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
973 }
974
975 /*
976 * Routine: lck_mtx_ext_init
977 */
978 void
979 lck_mtx_ext_init(
980 lck_mtx_ext_t * lck,
981 lck_grp_t * grp,
982 lck_attr_t * attr)
983 {
984 bzero((void *) lck, sizeof(lck_mtx_ext_t));
985
986 lck->lck_mtx.lck_mtx_type = LCK_MTX_TYPE;
987
988 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
989 lck->lck_mtx_deb.type = MUTEX_TAG;
990 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
991 }
992 lck->lck_mtx_grp = grp;
993
994 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
995 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
996 }
997 }
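/*
 * An illustrative sketch (not part of this file) of the mutex lifecycle
 * implemented below, for a mutex embedded in a hypothetical structure and
 * a lock group example_grp declared elsewhere:
 *
 *   struct example_object {
 *           lck_mtx_t       lock;
 *           int             refcount;
 *   };
 *
 *   lck_mtx_init(&obj->lock, &example_grp, LCK_ATTR_NULL);
 *
 *   lck_mtx_lock(&obj->lock);       // may block; preemption must be enabled
 *   obj->refcount++;
 *   lck_mtx_assert(&obj->lock, LCK_MTX_ASSERT_OWNED);
 *   lck_mtx_unlock(&obj->lock);
 *
 *   lck_mtx_destroy(&obj->lock, &example_grp);
 */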
998
999 /* The slow versions */
1000 static void lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
1001 static boolean_t lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread);
1002 static void lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
1003
1004 /* The adaptive spin function */
1005 static spinwait_result_t lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
1006
1007 /*
1008 * Routine: lck_mtx_verify
1009 *
1010 * Verify if a mutex is valid
1011 */
1012 static inline void
1013 lck_mtx_verify(lck_mtx_t *lock)
1014 {
1015 if (lock->lck_mtx_type != LCK_MTX_TYPE) {
1016 panic("Invalid mutex %p", lock);
1017 }
1018 #if DEVELOPMENT || DEBUG
1019 if (lock->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
1020 panic("Mutex destroyed %p", lock);
1021 }
1022 #endif /* DEVELOPMENT || DEBUG */
1023 }
1024
1025 /*
1026 * Routine: lck_mtx_check_preemption
1027 *
1028 * Verify preemption is enabled when attempting to acquire a mutex.
1029 */
1030
1031 static inline void
1032 lck_mtx_check_preemption(lck_mtx_t *lock)
1033 {
1034 #if DEVELOPMENT || DEBUG
1035 if (current_cpu_datap()->cpu_hibernate) {
1036 return;
1037 }
1038
1039 int pl = get_preemption_level();
1040
1041 if (pl != 0) {
1042 panic("Attempt to take mutex with preemption disabled. Lock=%p, level=%d", lock, pl);
1043 }
1044 #else
1045 (void)lock;
1046 #endif
1047 }
1048
1049 /*
1050 * Routine: lck_mtx_lock
1051 */
1052 void
1053 lck_mtx_lock(lck_mtx_t *lock)
1054 {
1055 thread_t thread;
1056
1057 lck_mtx_verify(lock);
1058 lck_mtx_check_preemption(lock);
1059 thread = current_thread();
1060 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
1061 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
1062 #if CONFIG_DTRACE
1063 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
1064 #endif /* CONFIG_DTRACE */
1065 return;
1066 }
1067 lck_mtx_lock_contended(lock, thread, FALSE);
1068 }
1069
1070 /*
1071 * This is the slow version of mutex locking.
1072 */
1073 static void NOINLINE
1074 lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
1075 {
1076 thread_t holding_thread;
1077 uintptr_t state;
1078 int waiters = 0;
1079 spinwait_result_t sw_res;
1080 struct turnstile *ts = NULL;
1081
1082 /* Loop waiting until I see that the mutex is unowned */
1083 for (;;) {
1084 sw_res = lck_mtx_lock_contended_spinwait_arm(lock, thread, interlocked);
1085 interlocked = FALSE;
1086
1087 switch (sw_res) {
1088 case SPINWAIT_ACQUIRED:
1089 if (ts != NULL) {
1090 interlock_lock(lock);
1091 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
1092 interlock_unlock(lock);
1093 }
1094 goto done;
1095 case SPINWAIT_INTERLOCK:
1096 goto set_owner;
1097 default:
1098 break;
1099 }
1100
1101 state = ordered_load_mtx(lock);
1102 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
1103 if (holding_thread == NULL) {
1104 break;
1105 }
1106 ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait
1107 lck_mtx_lock_wait(lock, holding_thread, &ts);
1108 /* returns interlock unlocked */
1109 }
1110
1111 set_owner:
1112 /* Hooray, I'm the new owner! */
1113 state = ordered_load_mtx(lock);
1114
1115 if (state & ARM_LCK_WAITERS) {
1116 /* Skip lck_mtx_lock_acquire if there are no waiters. */
1117 waiters = lck_mtx_lock_acquire(lock, ts);
1118 /*
1119 * lck_mtx_lock_acquire will call
1120 * turnstile_complete
1121 */
1122 } else {
1123 if (ts != NULL) {
1124 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
1125 }
1126 }
1127
1128 state = LCK_MTX_THREAD_TO_STATE(thread);
1129 if (waiters != 0) {
1130 state |= ARM_LCK_WAITERS;
1131 }
1132 state |= LCK_ILOCK; // Preserve interlock
1133 ordered_store_mtx(lock, state); // Set ownership
1134 interlock_unlock(lock); // Release interlock, enable preemption
1135
1136 done:
1137 load_memory_barrier();
1138
1139 assert(thread->turnstile != NULL);
1140
1141 if (ts != NULL) {
1142 turnstile_cleanup();
1143 }
1144
1145 #if CONFIG_DTRACE
1146 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
1147 #endif /* CONFIG_DTRACE */
1148 }
1149
1150 /*
1151 * Routine: lck_mtx_lock_contended_spinwait_arm
1152 *
1153 * Invoked trying to acquire a mutex when there is contention but
1154 * the holder is running on another processor. We spin for up to a maximum
1155 * time waiting for the lock to be released.
1156 */
1157 static spinwait_result_t
1158 lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
1159 {
1160 int has_interlock = (int)interlocked;
1161 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1162 thread_t owner, prev_owner;
1163 uint64_t window_deadline, sliding_deadline, high_deadline;
1164 uint64_t start_time, cur_time, avg_hold_time, bias, delta;
1165 int loopcount = 0;
1166 uint i, prev_owner_cpu;
1167 int total_hold_time_samples, window_hold_time_samples, unfairness;
1168 bool owner_on_core, adjust;
1169 uintptr_t state, new_state, waiters;
1170 spinwait_result_t retval = SPINWAIT_DID_SPIN_HIGH_THR;
1171
1172 if (__improbable(!(lck_mtx_adaptive_spin_mode & ADAPTIVE_SPIN_ENABLE))) {
1173 if (!has_interlock) {
1174 interlock_lock(lock);
1175 }
1176
1177 return SPINWAIT_DID_NOT_SPIN;
1178 }
1179
1180 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
1181 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(ordered_load_mtx(lock))), lock->lck_mtx_waiters, 0, 0);
1182
1183 start_time = mach_absolute_time();
1184 /*
1185 * window_deadline represents the "learning" phase.
1186 * The thread collects statistics about the lock during
1187 * window_deadline and then it makes a decision on whether to spin more
1188 * or block according to the concurrency behavior
1189 * observed.
1190 *
1191 * Every thread can spin at least low_MutexSpin.
1192 */
1193 window_deadline = start_time + low_MutexSpin;
1194 /*
1195 * Sliding_deadline is the adjusted spin deadline
1196 * computed after the "learning" phase.
1197 */
1198 sliding_deadline = window_deadline;
1199 /*
1200 * High_deadline is a hard deadline. No thread
1201 * can spin more than this deadline.
1202 */
1203 if (high_MutexSpin >= 0) {
1204 high_deadline = start_time + high_MutexSpin;
1205 } else {
1206 high_deadline = start_time + low_MutexSpin * real_ncpus;
1207 }
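/*
 * A worked example with made-up numbers: if low_MutexSpin corresponded to
 * roughly 50us of timebase ticks, high_MutexSpin were disabled (< 0) and
 * real_ncpus were 8, a thread starting at time t would get
 * window_deadline = t + 50us (the first learning window),
 * sliding_deadline = t + 50us (until adjusted below), and
 * high_deadline = t + 400us (the absolute spin cap).
 */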
1208
1209 /*
1210 * We do not yet know which cpu the owner is running on.
1211 * Initialize prev_owner_cpu with the next cpu.
1212 */
1213 prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
1214 total_hold_time_samples = 0;
1215 window_hold_time_samples = 0;
1216 avg_hold_time = 0;
1217 adjust = TRUE;
1218 bias = (os_hash_kernel_pointer(lock) + cpu_number()) % real_ncpus;
1219
1220 /* Snoop the lock state */
1221 state = ordered_load_mtx(lock);
1222 owner = LCK_MTX_STATE_TO_THREAD(state);
1223 prev_owner = owner;
1224
1225 if (has_interlock) {
1226 if (owner == NULL) {
1227 retval = SPINWAIT_INTERLOCK;
1228 goto done_spinning;
1229 } else {
1230 /*
1231 * We are holding the interlock, so
1232 * we can safely dereference owner.
1233 */
1234 if (!machine_thread_on_core(owner) || (owner->state & TH_IDLE)) {
1235 retval = SPINWAIT_DID_NOT_SPIN;
1236 goto done_spinning;
1237 }
1238 }
1239 interlock_unlock(lock);
1240 has_interlock = 0;
1241 }
1242
1243 /*
1244 * Spin while:
1245 * - mutex is locked, and
1246 * - it's locked as a spin lock, and
1247 * - owner is running on another processor, and
1248 * - we haven't spun for long enough.
1249 */
1250 do {
1251 /*
1252 * Try to acquire the lock.
1253 */
1254 owner = LCK_MTX_STATE_TO_THREAD(state);
1255 if (owner == NULL) {
1256 waiters = state & ARM_LCK_WAITERS;
1257 if (waiters) {
1258 /*
1259 * preserve the waiter bit
1260 * and try acquire the interlock.
1261 * Note: we will successfully acquire
1262 * the interlock only if we can also
1263 * acquire the lock.
1264 */
1265 new_state = ARM_LCK_WAITERS | LCK_ILOCK;
1266 has_interlock = 1;
1267 retval = SPINWAIT_INTERLOCK;
1268 disable_preemption();
1269 } else {
1270 new_state = LCK_MTX_THREAD_TO_STATE(thread);
1271 retval = SPINWAIT_ACQUIRED;
1272 }
1273
1274 /*
1275 * The cmpxchg will succeed only if the lock
1276 * is not owned (doesn't have an owner set)
1277 * and it is not interlocked.
1278 * It will not fail if there are waiters.
1279 */
1280 if (os_atomic_cmpxchgv(&lock->lck_mtx_data,
1281 waiters, new_state, &state, acquire)) {
1282 goto done_spinning;
1283 } else {
1284 if (waiters) {
1285 has_interlock = 0;
1286 enable_preemption();
1287 }
1288 }
1289 }
1290
1291 cur_time = mach_absolute_time();
1292
1293 /*
1294 * Never spin past high_deadline.
1295 */
1296 if (cur_time >= high_deadline) {
1297 retval = SPINWAIT_DID_SPIN_HIGH_THR;
1298 break;
1299 }
1300
1301 /*
1302 * Check if the owner is on core. If not, block.
1303 */
1304 owner = LCK_MTX_STATE_TO_THREAD(state);
1305 if (owner) {
1306 i = prev_owner_cpu;
1307 owner_on_core = FALSE;
1308
1309 disable_preemption();
1310 state = ordered_load_mtx(lock);
1311 owner = LCK_MTX_STATE_TO_THREAD(state);
1312
1313 /*
1314 * For scalability we want to check if the owner is on core
1315 * without locking the mutex interlock.
1316 * If we do not lock the mutex interlock, the owner that we see might be
1317 * invalid, so we cannot dereference it. Therefore we cannot check
1318 * any field of the thread to tell us if it is on core.
1319 * Instead, check whether the thread running on any other cpu matches the owner.
1320 */
1321 if (owner) {
1322 do {
1323 cpu_data_t *cpu_data_ptr = CpuDataEntries[i].cpu_data_vaddr;
1324 if ((cpu_data_ptr != NULL) && (cpu_data_ptr->cpu_active_thread == owner)) {
1325 owner_on_core = TRUE;
1326 break;
1327 }
1328 if (++i >= real_ncpus) {
1329 i = 0;
1330 }
1331 } while (i != prev_owner_cpu);
1332 enable_preemption();
1333
1334 if (owner_on_core) {
1335 prev_owner_cpu = i;
1336 } else {
1337 prev_owner = owner;
1338 state = ordered_load_mtx(lock);
1339 owner = LCK_MTX_STATE_TO_THREAD(state);
1340 if (owner == prev_owner) {
1341 /*
1342 * Owner is not on core.
1343 * Stop spinning.
1344 */
1345 if (loopcount == 0) {
1346 retval = SPINWAIT_DID_NOT_SPIN;
1347 } else {
1348 retval = SPINWAIT_DID_SPIN_OWNER_NOT_CORE;
1349 }
1350 break;
1351 }
1352 /*
1353 * Fall through if the owner changed while we were scanning.
1354 * The new owner could potentially be on core, so loop
1355 * again.
1356 */
1357 }
1358 } else {
1359 enable_preemption();
1360 }
1361 }
1362
1363 /*
1364 * Save how many times we see the owner changing.
1365 * We can roughly estimate the mutex hold
1366 * time and the fairness with that.
1367 */
1368 if (owner != prev_owner) {
1369 prev_owner = owner;
1370 total_hold_time_samples++;
1371 window_hold_time_samples++;
1372 }
1373
1374 /*
1375 * Learning window expired.
1376 * Try to adjust the sliding_deadline.
1377 */
1378 if (cur_time >= window_deadline) {
1379 /*
1380 * If there was no contention during the window,
1381 * stop spinning.
1382 */
1383 if (window_hold_time_samples < 1) {
1384 retval = SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION;
1385 break;
1386 }
1387
1388 if (adjust) {
1389 /*
1390 * For a fair lock, we'd wait for at most (NCPU-1) periods,
1391 * but the lock is unfair, so let's try to estimate by how much.
1392 */
1393 unfairness = total_hold_time_samples / real_ncpus;
1394
1395 if (unfairness == 0) {
1396 /*
1397 * We observed the owner changing `total_hold_time_samples` times which
1398 * let us estimate the average hold time of this mutex for the duration
1399 * of the spin time.
1400 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
1401 *
1402 * In this case spin at max avg_hold_time * (real_ncpus - 1)
1403 */
1404 delta = cur_time - start_time;
1405 sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
1406 } else {
1407 /*
1408 * In this case at least one of the other cpus was able to get the lock twice
1409 * while I was spinning.
1410 * We could spin longer but it won't necessarily help if the system is unfair.
1411 * Try to randomize the wait to reduce contention.
1412 *
1413 * We compute how much time we could potentially spin
1414 * and distribute it over the cpus.
1415 *
1416 * bias is an integer between 0 and real_ncpus.
1417 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
1418 */
1419 delta = high_deadline - cur_time;
1420 sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
1421 adjust = FALSE;
1422 }
1423 }
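/*
 * A worked example with made-up numbers for the two branches above,
 * assuming real_ncpus = 8:
 *
 * - If 6 owner changes were observed over 120us of spinning
 *   (total_hold_time_samples = 6, so unfairness = 6 / 8 = 0), then
 *   sliding_deadline = start_time + (120us * 7) / 6 = start_time + 140us,
 *   i.e. roughly (NCPU - 1) average hold times.
 *
 * - If 16 owner changes were observed (unfairness = 16 / 8 = 2) and 50us
 *   remain until high_deadline, a thread with bias = 3 sets
 *   sliding_deadline = cur_time + (50us * 3) / 8, about cur_time + 18us,
 *   spreading the remaining spin budget across the contending cpus.
 */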
1424
1425 window_deadline += low_MutexSpin;
1426 window_hold_time_samples = 0;
1427 }
1428
1429 /*
1430 * Stop spinning if we are past
1431 * the adjusted deadline.
1432 */
1433 if (cur_time >= sliding_deadline) {
1434 retval = SPINWAIT_DID_SPIN_SLIDING_THR;
1435 break;
1436 }
1437
1438 /*
1439 * We want to arm the monitor for wfe,
1440 * so load the lock exclusively.
1441 *
1442 * NOTE:
1443 * we rely on the fact that wfe will
1444 * eventually return even if the cache line
1445 * is not modified. This way we will keep
1446 * looping and checking if the deadlines expired.
1447 */
1448 state = os_atomic_load_exclusive(&lock->lck_mtx_data, relaxed);
1449 owner = LCK_MTX_STATE_TO_THREAD(state);
1450 if (owner != NULL) {
1451 wait_for_event();
1452 state = ordered_load_mtx(lock);
1453 } else {
1454 atomic_exchange_abort();
1455 }
1456
1457 loopcount++;
1458 } while (TRUE);
1459
1460 done_spinning:
1461 #if CONFIG_DTRACE
1462 /*
1463 * Note that we record a different probe id depending on whether
1464 * this is a direct or indirect mutex. This allows us to
1465 * penalize only lock groups that have debug/stats enabled
1466 * with dtrace processing if desired.
1467 */
1468 #if LOCKS_INDIRECT_ALLOW
1469 if (__probable(lock->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)) {
1470 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, lock,
1471 mach_absolute_time() - start_time);
1472 } else
1473 #endif /* LOCKS_INDIRECT_ALLOW */
1474 {
1475 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, lock,
1476 mach_absolute_time() - start_time);
1477 }
1478 /* The lockstat acquire event is recorded by the caller. */
1479 #endif
1480
1481 state = ordered_load_mtx(lock);
1482
1483 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
1484 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, retval, 0);
1485 if ((!has_interlock) && (retval != SPINWAIT_ACQUIRED)) {
1486 /* We must own either the lock or the interlock on return. */
1487 interlock_lock(lock);
1488 }
1489
1490 return retval;
1491 }
1492
1493
1494 /*
1495 * Common code for mutex locking as spinlock
1496 */
1497 static inline void
1498 lck_mtx_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
1499 {
1500 uintptr_t state;
1501
1502 interlock_lock(lock);
1503 state = ordered_load_mtx(lock);
1504 if (LCK_MTX_STATE_TO_THREAD(state)) {
1505 if (allow_held_as_mutex) {
1506 lck_mtx_lock_contended(lock, current_thread(), TRUE);
1507 } else {
1508 // "Always" variants can never block. If the lock is held and blocking is not allowed
1509 // then someone is mixing always and non-always calls on the same lock, which is
1510 // forbidden.
1511 panic("Attempting to block on a lock taken as spin-always %p", lock);
1512 }
1513 return;
1514 }
1515 state &= ARM_LCK_WAITERS; // Preserve waiters bit
1516 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
1517 ordered_store_mtx(lock, state);
1518 load_memory_barrier();
1519
1520 #if CONFIG_DTRACE
1521 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
1522 #endif /* CONFIG_DTRACE */
1523 }
1524
1525 /*
1526 * Routine: lck_mtx_lock_spin
1527 */
1528 void
1529 lck_mtx_lock_spin(lck_mtx_t *lock)
1530 {
1531 lck_mtx_check_preemption(lock);
1532 lck_mtx_lock_spin_internal(lock, TRUE);
1533 }
1534
1535 /*
1536 * Routine: lck_mtx_lock_spin_always
1537 */
1538 void
1539 lck_mtx_lock_spin_always(lck_mtx_t *lock)
1540 {
1541 lck_mtx_lock_spin_internal(lock, FALSE);
1542 }
1543
1544 /*
1545 * Routine: lck_mtx_try_lock
1546 */
1547 boolean_t
1548 lck_mtx_try_lock(lck_mtx_t *lock)
1549 {
1550 thread_t thread = current_thread();
1551
1552 lck_mtx_verify(lock);
1553 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
1554 0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
1555 #if CONFIG_DTRACE
1556 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0);
1557 #endif /* CONFIG_DTRACE */
1558 return TRUE;
1559 }
1560 return lck_mtx_try_lock_contended(lock, thread);
1561 }
1562
1563 static boolean_t NOINLINE
1564 lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread)
1565 {
1566 thread_t holding_thread;
1567 uintptr_t state;
1568 int waiters;
1569
1570 interlock_lock(lock);
1571 state = ordered_load_mtx(lock);
1572 holding_thread = LCK_MTX_STATE_TO_THREAD(state);
1573 if (holding_thread) {
1574 interlock_unlock(lock);
1575 return FALSE;
1576 }
1577 waiters = lck_mtx_lock_acquire(lock, NULL);
1578 state = LCK_MTX_THREAD_TO_STATE(thread);
1579 if (waiters != 0) {
1580 state |= ARM_LCK_WAITERS;
1581 }
1582 state |= LCK_ILOCK; // Preserve interlock
1583 ordered_store_mtx(lock, state); // Set ownership
1584 interlock_unlock(lock); // Release interlock, enable preemption
1585 load_memory_barrier();
1586
1587 turnstile_cleanup();
1588
1589 return TRUE;
1590 }
1591
1592 static inline boolean_t
1593 lck_mtx_try_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
1594 {
1595 uintptr_t state;
1596
1597 if (!interlock_try(lock)) {
1598 return FALSE;
1599 }
1600 state = ordered_load_mtx(lock);
1601 if (LCK_MTX_STATE_TO_THREAD(state)) {
1602 // Lock is held as mutex
1603 if (allow_held_as_mutex) {
1604 interlock_unlock(lock);
1605 } else {
1606 // "Always" variants can never block. If the lock is held as a normal mutex
1607 // then someone is mixing always and non-always calls on the same lock, which is
1608 // forbidden.
1609 panic("Spin-mutex held as full mutex %p", lock);
1610 }
1611 return FALSE;
1612 }
1613 state &= ARM_LCK_WAITERS; // Preserve waiters bit
1614 state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK); // Add spin tag and maintain interlock
1615 ordered_store_mtx(lock, state);
1616 load_memory_barrier();
1617
1618 #if CONFIG_DTRACE
1619 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
1620 #endif /* CONFIG_DTRACE */
1621 return TRUE;
1622 }
1623
1624 /*
1625 * Routine: lck_mtx_try_lock_spin
1626 */
1627 boolean_t
1628 lck_mtx_try_lock_spin(lck_mtx_t *lock)
1629 {
1630 return lck_mtx_try_lock_spin_internal(lock, TRUE);
1631 }
1632
1633 /*
1634 * Routine: lck_mtx_try_lock_spin_always
1635 */
1636 boolean_t
1637 lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
1638 {
1639 return lck_mtx_try_lock_spin_internal(lock, FALSE);
1640 }
1641
1642
1643
1644 /*
1645 * Routine: lck_mtx_unlock
1646 */
1647 void
1648 lck_mtx_unlock(lck_mtx_t *lock)
1649 {
1650 thread_t thread = current_thread();
1651 uintptr_t state;
1652 boolean_t ilk_held = FALSE;
1653
1654 lck_mtx_verify(lock);
1655
1656 state = ordered_load_mtx(lock);
1657 if (state & LCK_ILOCK) {
1658 if (LCK_MTX_STATE_TO_THREAD(state) == (thread_t)LCK_MTX_SPIN_TAG) {
1659 ilk_held = TRUE; // Interlock is held by (presumably) this thread
1660 }
1661 goto slow_case;
1662 }
1663 // Locked as a mutex
1664 if (os_atomic_cmpxchg(&lock->lck_mtx_data,
1665 LCK_MTX_THREAD_TO_STATE(thread), 0, release)) {
1666 #if CONFIG_DTRACE
1667 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
1668 #endif /* CONFIG_DTRACE */
1669 return;
1670 }
1671 slow_case:
1672 lck_mtx_unlock_contended(lock, thread, ilk_held);
1673 }
1674
1675 static void NOINLINE
1676 lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
1677 {
1678 uintptr_t state;
1679 boolean_t cleanup = FALSE;
1680
1681 if (ilk_held) {
1682 state = ordered_load_mtx(lock);
1683 } else {
1684 interlock_lock(lock);
1685 state = ordered_load_mtx(lock);
1686 if (thread != LCK_MTX_STATE_TO_THREAD(state)) {
1687 panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
1688 }
1689 if (state & ARM_LCK_WAITERS) {
1690 if (lck_mtx_unlock_wakeup(lock, thread)) {
1691 state = ARM_LCK_WAITERS;
1692 } else {
1693 state = 0;
1694 }
1695 cleanup = TRUE;
1696 goto unlock;
1697 }
1698 }
1699 state &= ARM_LCK_WAITERS; /* Clear state, retain waiters bit */
1700 unlock:
1701 state |= LCK_ILOCK;
1702 ordered_store_mtx(lock, state);
1703 interlock_unlock(lock);
1704 if (cleanup) {
1705 /*
1706 * Do not do any turnstile operations outside of this block.
1707 * lock/unlock is called at an early stage of boot with a single thread,
1708 * when turnstiles are not yet initialized.
1709 * Even without contention we can come through the slow path
1710 * if the mutex is acquired as a spin lock.
1711 */
1712 turnstile_cleanup();
1713 }
1714
1715 #if CONFIG_DTRACE
1716 LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
1717 #endif /* CONFIG_DTRACE */
1718 }
1719
1720 /*
1721 * Routine: lck_mtx_assert
1722 */
1723 void
1724 lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
1725 {
1726 thread_t thread, holder;
1727 uintptr_t state;
1728
1729 state = ordered_load_mtx(lock);
1730 holder = LCK_MTX_STATE_TO_THREAD(state);
1731 if (holder == (thread_t)LCK_MTX_SPIN_TAG) {
1732 // Lock is held in spin mode, owner is unknown.
1733 return; // Punt
1734 }
1735 thread = current_thread();
1736 if (type == LCK_MTX_ASSERT_OWNED) {
1737 if (thread != holder) {
1738 panic("lck_mtx_assert(): mutex (%p) owned", lock);
1739 }
1740 } else if (type == LCK_MTX_ASSERT_NOTOWNED) {
1741 if (thread == holder) {
1742 panic("lck_mtx_assert(): mutex (%p) not owned", lock);
1743 }
1744 } else {
1745 panic("lck_mtx_assert(): invalid arg (%u)", type);
1746 }
1747 }
1748
1749 /*
1750 * Routine: lck_mtx_ilk_unlock
1751 */
1752 boolean_t
1753 lck_mtx_ilk_unlock(lck_mtx_t *lock)
1754 {
1755 interlock_unlock(lock);
1756 return TRUE;
1757 }
1758
1759 /*
1760 * Routine: lck_mtx_convert_spin
1761 *
1762 * Convert a mutex held for spin into a held full mutex
1763 */
1764 void
1765 lck_mtx_convert_spin(lck_mtx_t *lock)
1766 {
1767 thread_t thread = current_thread();
1768 uintptr_t state;
1769 int waiters;
1770
1771 state = ordered_load_mtx(lock);
1772 if (LCK_MTX_STATE_TO_THREAD(state) == thread) {
1773 return; // Already owned as mutex, return
1774 }
1775 if ((state & LCK_ILOCK) == 0 || (LCK_MTX_STATE_TO_THREAD(state) != (thread_t)LCK_MTX_SPIN_TAG)) {
1776 panic("lck_mtx_convert_spin: Not held as spinlock (%p)", lock);
1777 }
1778 state &= ~(LCK_MTX_THREAD_MASK); // Clear the spin tag
1779 ordered_store_mtx(lock, state);
1780 waiters = lck_mtx_lock_acquire(lock, NULL); // Acquire to manage priority boosts
1781 state = LCK_MTX_THREAD_TO_STATE(thread);
1782 if (waiters != 0) {
1783 state |= ARM_LCK_WAITERS;
1784 }
1785 state |= LCK_ILOCK;
1786 ordered_store_mtx(lock, state); // Set ownership
1787 interlock_unlock(lock); // Release interlock, enable preemption
1788 turnstile_cleanup();
1789 }
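/*
 * An illustrative sketch (not part of this file) of the spin-then-convert
 * pattern that lck_mtx_lock_spin() and lck_mtx_convert_spin() support; the
 * helpers example_fast_path() and example_slow_path_that_may_block() are
 * placeholders:
 *
 *   lck_mtx_lock_spin(&obj->lock);          // cheap: interlock + spin tag only
 *   if (example_fast_path(obj)) {
 *           lck_mtx_unlock(&obj->lock);     // never blocked, never a full owner
 *           return;
 *   }
 *   lck_mtx_convert_spin(&obj->lock);       // become the full mutex owner
 *   example_slow_path_that_may_block(obj);
 *   lck_mtx_unlock(&obj->lock);
 */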
1790
1791
1792 /*
1793 * Routine: lck_mtx_destroy
1794 */
1795 void
1796 lck_mtx_destroy(
1797 lck_mtx_t * lck,
1798 lck_grp_t * grp)
1799 {
1800 if (lck->lck_mtx_type != LCK_MTX_TYPE) {
1801 panic("Destroying invalid mutex %p", lck);
1802 }
1803 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
1804 panic("Destroying previously destroyed lock %p", lck);
1805 }
1806 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
1807 lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
1808 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
1809 lck_grp_deallocate(grp);
1810 return;
1811 }
1812
1813 /*
1814 * Routine: lck_spin_assert
1815 */
1816 void
1817 lck_spin_assert(lck_spin_t *lock, unsigned int type)
1818 {
1819 thread_t thread, holder;
1820 uintptr_t state;
1821
1822 if (lock->type != LCK_SPIN_TYPE) {
1823 panic("Invalid spinlock %p", lock);
1824 }
1825
1826 state = lock->lck_spin_data;
1827 holder = (thread_t)(state & ~LCK_ILOCK);
1828 thread = current_thread();
1829 if (type == LCK_ASSERT_OWNED) {
1830 if (holder == 0) {
1831 panic("Lock not owned %p = %lx", lock, state);
1832 }
1833 if (holder != thread) {
1834 panic("Lock not owned by current thread %p = %lx", lock, state);
1835 }
1836 if ((state & LCK_ILOCK) == 0) {
1837 panic("Lock bit not set %p = %lx", lock, state);
1838 }
1839 } else if (type == LCK_ASSERT_NOTOWNED) {
1840 if (holder != 0) {
1841 if (holder == thread) {
1842 panic("Lock owned by current thread %p = %lx", lock, state);
1843 }
1844 }
1845 } else {
1846 panic("lck_spin_assert(): invalid arg (%u)", type);
1847 }
1848 }
1849
1850 /*
1851 * Routine: kdp_lck_mtx_lock_spin_is_acquired
1852 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1853 */
1854 boolean_t
1855 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
1856 {
1857 uintptr_t state;
1858
1859 if (not_in_kdp) {
1860 panic("panic: spinlock acquired check done outside of kernel debugger");
1861 }
1862 state = ordered_load_mtx(lck);
1863 if (state == LCK_MTX_TAG_DESTROYED) {
1864 return FALSE;
1865 }
1866 if (LCK_MTX_STATE_TO_THREAD(state) || (state & LCK_ILOCK)) {
1867 return TRUE;
1868 }
1869 return FALSE;
1870 }
1871
1872 void
1873 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
1874 {
1875 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
1876 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1877 uintptr_t state = ordered_load_mtx(mutex);
1878 thread_t holder = LCK_MTX_STATE_TO_THREAD(state);
1879 if ((uintptr_t)holder == (uintptr_t)LCK_MTX_SPIN_TAG) {
1880 waitinfo->owner = STACKSHOT_WAITOWNER_MTXSPIN;
1881 } else {
1882 assertf(state != (uintptr_t)LCK_MTX_TAG_DESTROYED, "state=0x%llx", (uint64_t)state);
1883 #if LOCKS_INDIRECT_ALLOW
1884 assertf(state != (uintptr_t)LCK_MTX_TAG_INDIRECT, "state=0x%llx", (uint64_t)state);
1885 #endif /* LOCKS_INDIRECT_ALLOW */
1886 waitinfo->owner = thread_tid(holder);
1887 }
1888 }
1889