/*
 * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright notice
 * and this permission notice appear in all copies of the software,
 * derivative works or modified versions, and any portions thereof, and that
 * both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
 * CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES
 * WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 * File:    kern/lock.c
 * Author:  Avadis Tevanian, Jr., Michael Wayne Young
 * Date:    1985
 *
 * Locking primitives implementation
 */

#define LOCK_PRIVATE 1

#include <mach_ldebug.h>

#include <mach/machine/sdt.h>

#include <kern/zalloc.h>
#include <kern/lock_stat.h>
#include <kern/locks.h>
#include <kern/misc_protos.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/sched_hygiene.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>
#include <kern/kcdata.h>
#include <kern/percpu.h>
#include <string.h>
#include <arm/cpu_internal.h>
#include <os/hash.h>
#include <arm/cpu_data.h>

#include <arm/cpu_data_internal.h>
#include <arm/proc_reg.h>
#include <arm/smp.h>
#include <machine/atomic.h>
#include <machine/machine_cpu.h>

#include <pexpert/pexpert.h>

#include <sys/kdebug.h>

#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)

// Panic in tests that check lock usage correctness.
// Such panics are undesirable while already in a panic or while a debugger is running.
#define LOCK_CORRECTNESS_PANIC() (kernel_debugger_entry_count == 0)

#define ADAPTIVE_SPIN_ENABLE 0x1

int lck_mtx_adaptive_spin_mode = ADAPTIVE_SPIN_ENABLE;

#define SPINWAIT_OWNER_CHECK_COUNT 4

typedef enum {
    SPINWAIT_ACQUIRED,                      /* Got the lock. */
    SPINWAIT_INTERLOCK,                     /* Got the interlock, no owner, but caller must finish acquiring the lock. */
    SPINWAIT_DID_SPIN_HIGH_THR,             /* Got the interlock, spun, but failed to get the lock. */
    SPINWAIT_DID_SPIN_OWNER_NOT_CORE,       /* Got the interlock, spun, but failed to get the lock. */
    SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION, /* Got the interlock, spun, but failed to get the lock. */
    SPINWAIT_DID_SPIN_SLIDING_THR,          /* Got the interlock, spun, but failed to get the lock. */
    SPINWAIT_DID_NOT_SPIN,                  /* Got the interlock, did not spin. */
} spinwait_result_t;

/* Forwards */

extern unsigned int not_in_kdp;

/*
 * We often want to know the addresses of the callers
 * of the various lock routines. However, this information
 * is only used for debugging and statistics.
 */
typedef void *pc_t;
#define INVALID_PC     ((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)

#ifdef lint
/*
 * Eliminate lint complaints about unused local pc variables.
 */
#define OBTAIN_PC(pc, l) ++pc
#else /* lint */
#define OBTAIN_PC(pc, l)
#endif /* lint */


/*
 * Portable lock package implementation of usimple_locks.
 */

/*
 * Owner thread pointer when lock held in spin mode
 */
#define LCK_MTX_SPIN_TAG 0xfffffff0


#define interlock_lock(lock)   hw_lock_bit((hw_lock_bit_t *)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
#define interlock_try(lock)    hw_lock_bit_try((hw_lock_bit_t *)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT, LCK_GRP_NULL)
#define interlock_unlock(lock) hw_unlock_bit((hw_lock_bit_t *)(&(lock)->lck_mtx_data), LCK_ILOCK_BIT)
#define load_memory_barrier()  os_atomic_thread_fence(acquire)

// Enforce program order of loads and stores.
#define ordered_load(target) \
        os_atomic_load(target, compiler_acq_rel)
#define ordered_store(target, value) \
        os_atomic_store(target, value, compiler_acq_rel)

#define ordered_load_mtx(lock)         ordered_load(&(lock)->lck_mtx_data)
#define ordered_store_mtx(lock, value) ordered_store(&(lock)->lck_mtx_data, (value))
#define ordered_load_hw(lock)          ordered_load(&(lock)->lock_data)
#define ordered_store_hw(lock, value)  ordered_store(&(lock)->lock_data, (value))
#define ordered_load_bit(lock)         ordered_load((lock))
#define ordered_store_bit(lock, value) ordered_store((lock), (value))


// Prevent the compiler from reordering memory operations around this
#define compiler_memory_fence() __asm__ volatile ("" ::: "memory")

MACHINE_TIMEOUT32(lock_panic_timeout, "lock-panic",
    0xc00000 /* 12.5 M ticks ~= 524ms with 24MHz OSC */, MACHINE_TIMEOUT_UNIT_TIMEBASE, NULL);

#define NOINLINE __attribute__((noinline))


#if __arm__
#define interrupts_disabled(mask) (mask & PSR_INTMASK)
#else
#define interrupts_disabled(mask) (mask & DAIF_IRQF)
#endif


#if __arm__
#define enable_fiq()        __asm__ volatile ("cpsie f" ::: "memory");
#define enable_interrupts() __asm__ volatile ("cpsie if" ::: "memory");
#endif

KALLOC_TYPE_DEFINE(KT_LCK_SPIN, lck_spin_t, KT_PRIV_ACCT);

KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);

KALLOC_TYPE_DEFINE(KT_LCK_MTX_EXT, lck_mtx_ext_t, KT_PRIV_ACCT);

#pragma GCC visibility push(hidden)
/*
 * The atomic exchange API is a low-level abstraction of the operations
 * to atomically read, modify, and write a pointer. This abstraction works
 * for both Intel and ARMv8.1 compare-and-exchange atomic instructions as
 * well as the ARM exclusive instructions.
 *
 * atomic_exchange_begin() - begin exchange and retrieve current value
 * atomic_exchange_complete() - conclude an exchange
 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
 */
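
/*
 * Illustrative sketch (not part of this file's API): a bounded increment
 * built from the exchange primitives below. The begin/complete/abort
 * pattern is the point; `counter` and `limit` are hypothetical.
 *
 *      static boolean_t
 *      increment_below_limit(uint32_t *counter, uint32_t limit)
 *      {
 *          uint32_t old, new;
 *
 *          for (;;) {
 *              new = atomic_exchange_begin32(counter, &old, memory_order_relaxed);
 *              if (new >= limit) {
 *                  atomic_exchange_abort();    // must release the exclusive monitor
 *                  return FALSE;
 *              }
 *              new++;
 *              if (atomic_exchange_complete32(counter, old, new, memory_order_relaxed)) {
 *                  return TRUE;
 *              }
 *              // exchange failed (monitor lost or value changed): retry
 *          }
 *      }
 */
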
uint32_t
load_exclusive32(uint32_t *target, enum memory_order ord)
{
    uint32_t value;

#if __arm__
    if (_os_atomic_mo_has_release(ord)) {
        // Pre-load release barrier
        atomic_thread_fence(memory_order_release);
    }
    value = __builtin_arm_ldrex(target);
#else
    if (_os_atomic_mo_has_acquire(ord)) {
        value = __builtin_arm_ldaex(target);    // ldaxr
    } else {
        value = __builtin_arm_ldrex(target);    // ldxr
    }
#endif  // __arm__
    return value;
}

boolean_t
store_exclusive32(uint32_t *target, uint32_t value, enum memory_order ord)
{
    boolean_t err;

#if __arm__
    err = __builtin_arm_strex(value, target);
    if (_os_atomic_mo_has_acquire(ord)) {
        // Post-store acquire barrier
        atomic_thread_fence(memory_order_acquire);
    }
#else
    if (_os_atomic_mo_has_release(ord)) {
        err = __builtin_arm_stlex(value, target);       // stlxr
    } else {
        err = __builtin_arm_strex(value, target);       // stxr
    }
#endif  // __arm__
    return !err;
}

uint32_t
atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
{
    uint32_t val;

#if __ARM_ATOMICS_8_1
    ord = memory_order_relaxed;
#endif
    val = load_exclusive32(target, ord);
    *previous = val;
    return val;
}

boolean_t
atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
{
#if __ARM_ATOMICS_8_1
    return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
#else
    (void)previous;             // Previous not needed, monitor is held
    return store_exclusive32(target, newval, ord);
#endif
}

void
atomic_exchange_abort(void)
{
    os_atomic_clear_exclusive();
}

boolean_t
atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
{
    uint32_t value, prev;

    for (;;) {
        value = atomic_exchange_begin32(target, &prev, ord);
        if (value & test_mask) {
            if (wait) {
                wait_for_event();               // Wait with monitor held
            } else {
                atomic_exchange_abort();        // Clear exclusive monitor
            }
            return FALSE;
        }
        value |= set_mask;
        if (atomic_exchange_complete32(target, prev, value, ord)) {
            return TRUE;
        }
    }
}
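
/*
 * Example use of atomic_test_and_set32() (sketch; `lockp` and MY_LOCK_BIT
 * are hypothetical names): atomically set a lock bit with acquire
 * semantics, waiting with the monitor armed while it is already set.
 *
 *      while (!atomic_test_and_set32(lockp, MY_LOCK_BIT, MY_LOCK_BIT,
 *          memory_order_acquire, TRUE)) {
 *          // FALSE return: bit was already set; wfe woke us, try again
 *      }
 */
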

#pragma GCC visibility pop

#if SCHED_PREEMPTION_DISABLE_DEBUG

uint64_t PERCPU_DATA(preemption_disable_max_mt);

MACHINE_TIMEOUT_WRITEABLE(sched_preemption_disable_threshold_mt, "sched-preemption", 0, MACHINE_TIMEOUT_UNIT_TIMEBASE, kprintf_spam_mt_pred);

TUNABLE_DT_WRITEABLE(sched_hygiene_mode_t, sched_preemption_disable_debug_mode,
    "machine-timeouts",
    "sched-preemption-disable-mode", /* DT property names have to be 31 chars max */
    "sched_preemption_disable_debug_mode",
    SCHED_HYGIENE_MODE_OFF,
    TUNABLE_DT_CHECK_CHOSEN);

static uint32_t const sched_preemption_disable_debug_dbgid = MACHDBG_CODE(DBG_MACH_SCHED, MACH_PREEMPTION_EXPIRED) | DBG_FUNC_NONE;

NOINLINE void
_prepare_preemption_disable_measurement(thread_t thread)
{
    if (thread->machine.inthandler_timestamp == 0) {
        /*
         * Only prepare a measurement if not currently in an interrupt
         * handler.
         *
         * We are only interested in the net duration of disabled
         * preemption, that is: the time in which preemption was
         * disabled, minus the intervals in which any (likely
         * unrelated) interrupts were handled.
         * ml_adjust_preemption_disable_time() will remove those
         * intervals; however, we also do not even start measuring
         * preemption disablement if we are already within handling of
         * an interrupt when preemption was disabled (the resulting
         * net time would be 0).
         *
         * Interrupt handling duration is handled separately, and any
         * long intervals of preemption disablement are counted
         * towards that.
         */
        thread->machine.preemption_disable_adj_mt = ml_get_speculative_timebase();
    }
}

NOINLINE void
_collect_preemption_disable_measurement(thread_t thread)
{
    bool istate = ml_set_interrupts_enabled(false);
    /*
     * Collect start time and current time with interrupts disabled.
     * Otherwise an interrupt coming in after grabbing the timestamp
     * could spuriously inflate the measurement, because it will
     * adjust preemption_disable_adj_mt only after we already grabbed
     * it.
     *
     * (Even worse if we collected the current time first: then a
     * subsequent interrupt could adjust preemption_disable_adj_mt to
     * make the duration go negative after subtracting the already
     * grabbed time. With interrupts disabled we don't care much about
     * the order.)
     */

    uint64_t const mt = thread->machine.preemption_disable_adj_mt;
    uint64_t const now = ml_get_speculative_timebase();

    os_compiler_barrier(acq_rel);

    ml_set_interrupts_enabled(istate);

    int64_t const duration = now - mt;

    uint64_t * const max_duration = PERCPU_GET(preemption_disable_max_mt);

    if (__improbable(duration > *max_duration)) {
        *max_duration = duration;
    }

    uint64_t const threshold = os_atomic_load(&sched_preemption_disable_threshold_mt, relaxed);
    if (__improbable(threshold > 0 && duration >= threshold)) {
        if (sched_preemption_disable_debug_mode == SCHED_HYGIENE_MODE_PANIC) {
            panic("preemption disable timeout exceeded: %llu >= %llu timebase ticks", duration, threshold);
        }

        DTRACE_SCHED1(mach_preemption_expired, uint64_t, duration);
        if (__improbable(kdebug_debugid_enabled(sched_preemption_disable_debug_dbgid))) {
            KDBG(sched_preemption_disable_debug_dbgid, duration);
        }
    }

    thread->machine.preemption_disable_adj_mt = 0;
}

/*
 * Skip predicate for sched_preemption_disable, which would trigger
 * spuriously when kprintf spam is enabled.
 */
bool
kprintf_spam_mt_pred(struct machine_timeout_spec const __unused *spec)
{
    bool const kprintf_spam_enabled = !(disable_kprintf_output || disable_serial_output);
    return kprintf_spam_enabled;
}

#endif /* SCHED_PREEMPTION_DISABLE_DEBUG */

/*
 * To help _disable_preemption() inline everywhere with LTO,
 * we keep these nice non-inlineable functions as the panic()
 * codegen setup is quite large and for weird reasons causes a frame.
 */
__abortlike
static void
_disable_preemption_overflow(void)
{
    panic("Preemption count overflow");
}

void
_disable_preemption(void)
{
    thread_t thread = current_thread();
    unsigned int count = thread->machine.preemption_count;

    if (__improbable(++count == 0)) {
        _disable_preemption_overflow();
    }

    os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);

#if SCHED_PREEMPTION_DISABLE_DEBUG

    /*
     * Note that this is not the only place preemption gets disabled;
     * it also gets modified on ISR and PPL entry/exit. Both of those
     * events will be treated specially, however, and
     * increment/decrement being paired around their entry/exit means
     * that collection here is not desynced otherwise.
     */

    if (count == 1 && sched_preemption_disable_debug_mode) {
        _prepare_preemption_disable_measurement(thread);
    }
#endif /* SCHED_PREEMPTION_DISABLE_DEBUG */
}

/*
 * This variant of _disable_preemption() allows disabling preemption
 * without taking measurements (and later potentially triggering
 * actions on those).
 *
 * We do this through a separate variant because we do not want to
 * disturb inlinability of _disable_preemption(). However, in order to
 * also avoid code duplication, instead of repeating common code we
 * simply call _disable_preemption() and explicitly abandon any taken
 * measurement.
 */
void
_disable_preemption_without_measurements(void)
{
    _disable_preemption();

#if SCHED_PREEMPTION_DISABLE_DEBUG
    /*
     * Abandon a potential preemption disable measurement. Useful, for
     * example, for the idle thread, which would just spuriously
     * trigger the threshold while actually idling, which we don't
     * care about.
     */
    thread_t t = current_thread();
    if (t->machine.preemption_disable_adj_mt != 0) {
        t->machine.preemption_disable_adj_mt = 0;
    }
#endif /* SCHED_PREEMPTION_DISABLE_DEBUG */
}

/*
 * This function checks whether an AST_URGENT has been pended.
 *
 * It is called once preemption has been reenabled, which means the thread
 * may have been preempted right before this was called, and by the time this
 * function actually performs the check, we may have changed CPU.
 *
 * This race is however benign: the point of AST_URGENT is to trigger a context
 * switch, so if one happened, there's nothing left to check for, and AST_URGENT
 * was cleared in the process.
 *
 * It follows that this check cannot have false negatives, which allows us
 * to avoid fiddling with interrupt state for the vast majority of cases
 * when the check will actually be negative.
 */
static NOINLINE void
kernel_preempt_check(thread_t thread)
{
    long state;

#if __arm__
#define INTERRUPT_MASK PSR_IRQF
#else // __arm__
#define INTERRUPT_MASK DAIF_IRQF
#endif // __arm__

    /* If interrupts are masked, we can't take an AST here */
    state = get_interrupts();
    if ((state & INTERRUPT_MASK) == 0) {
        disable_interrupts_noread();                    // Disable interrupts

        /*
         * Reload cpu_pending_ast: a context switch would cause it to change.
         * Now that interrupts are disabled, this will debounce false positives.
         */
        if (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
#if __arm__
#if __ARM_USER_PROTECT__
            uintptr_t up = arm_user_protect_begin(thread);
#endif // __ARM_USER_PROTECT__
            enable_fiq();
#endif // __arm__
            ast_taken_kernel();                         // Handle urgent AST
#if __arm__
#if __ARM_USER_PROTECT__
            arm_user_protect_end(thread, up, TRUE);
#endif // __ARM_USER_PROTECT__
            enable_interrupts();
            return;                                     // Return early on arm only due to FIQ enabling
#endif // __arm__
        }
        restore_interrupts(state);                      // Enable interrupts
    }
}

/*
 * To help _enable_preemption() inline everywhere with LTO,
 * we keep these nice non-inlineable functions as the panic()
 * codegen setup is quite large and for weird reasons causes a frame.
 */
__abortlike
static void
_enable_preemption_underflow(void)
{
    panic("Preemption count underflow");
}

void
_enable_preemption(void)
{
    thread_t thread = current_thread();
    unsigned int count = thread->machine.preemption_count;

    if (__improbable(count == 0)) {
        _enable_preemption_underflow();
    }
    count -= 1;

#if SCHED_PREEMPTION_DISABLE_DEBUG
    if (count == 0 && thread->machine.preemption_disable_adj_mt != 0) {
        _collect_preemption_disable_measurement(thread);
    }
#endif /* SCHED_PREEMPTION_DISABLE_DEBUG */

    os_atomic_store(&thread->machine.preemption_count, count, compiler_acq_rel);
    if (count == 0) {
        /*
         * This check is racy and could load from another CPU's pending_ast mask,
         * but as described above, this can't have false negatives.
         */
        if (__improbable(thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT)) {
            kernel_preempt_check(thread);
        }
    }

    os_compiler_barrier();
}

int
get_preemption_level(void)
{
    return current_thread()->machine.preemption_count;
}
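
/*
 * Typical pairing (illustrative sketch, not a new API): disable/enable
 * must nest, and the current depth can be asserted around code that
 * must not migrate between CPUs.
 *
 *      _disable_preemption();
 *      assert(get_preemption_level() > 0);
 *      ... per-cpu work ...
 *      _enable_preemption();   // last enable may run kernel_preempt_check()
 */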

/*
 * Routine: lck_spin_alloc_init
 */
lck_spin_t *
lck_spin_alloc_init(
    lck_grp_t *grp,
    lck_attr_t *attr)
{
    lck_spin_t *lck;

    lck = zalloc(KT_LCK_SPIN);
    lck_spin_init(lck, grp, attr);
    return lck;
}

/*
 * Routine: lck_spin_free
 */
void
lck_spin_free(
    lck_spin_t *lck,
    lck_grp_t *grp)
{
    lck_spin_destroy(lck, grp);
    zfree(KT_LCK_SPIN, lck);
}

/*
 * Routine: lck_spin_init
 */
void
lck_spin_init(
    lck_spin_t *lck,
    lck_grp_t *grp,
    __unused lck_attr_t *attr)
{
    lck->type = LCK_SPIN_TYPE;
    hw_lock_init(&lck->hwlock);
    if (grp) {
        lck_grp_reference(grp, &grp->lck_grp_spincnt);
    }
}

/*
 * arm_usimple_lock is a lck_spin_t without a group or attributes
 */
MARK_AS_HIBERNATE_TEXT void inline
arm_usimple_lock_init(simple_lock_t lck, __unused unsigned short initial_value)
{
    lck->type = LCK_SPIN_TYPE;
    hw_lock_init(&lck->hwlock);
}


/*
 * Routine: lck_spin_lock
 */
void
lck_spin_lock(lck_spin_t *lock)
{
#if DEVELOPMENT || DEBUG
    if (lock->type != LCK_SPIN_TYPE) {
        panic("Invalid spinlock %p", lock);
    }
#endif // DEVELOPMENT || DEBUG
    hw_lock_lock(&lock->hwlock, LCK_GRP_NULL);
}

void
lck_spin_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
{
#pragma unused(grp)
#if DEVELOPMENT || DEBUG
    if (lock->type != LCK_SPIN_TYPE) {
        panic("Invalid spinlock %p", lock);
    }
#endif // DEVELOPMENT || DEBUG
    hw_lock_lock(&lock->hwlock, grp);
}

/*
 * Routine: lck_spin_lock_nopreempt
 */
void
lck_spin_lock_nopreempt(lck_spin_t *lock)
{
#if DEVELOPMENT || DEBUG
    if (lock->type != LCK_SPIN_TYPE) {
        panic("Invalid spinlock %p", lock);
    }
#endif // DEVELOPMENT || DEBUG
    hw_lock_lock_nopreempt(&lock->hwlock, LCK_GRP_NULL);
}

void
lck_spin_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
{
#pragma unused(grp)
#if DEVELOPMENT || DEBUG
    if (lock->type != LCK_SPIN_TYPE) {
        panic("Invalid spinlock %p", lock);
    }
#endif // DEVELOPMENT || DEBUG
    hw_lock_lock_nopreempt(&lock->hwlock, grp);
}

/*
 * Routine: lck_spin_try_lock
 */
int
lck_spin_try_lock(lck_spin_t *lock)
{
    return hw_lock_try(&lock->hwlock, LCK_GRP_NULL);
}

int
lck_spin_try_lock_grp(lck_spin_t *lock, lck_grp_t *grp)
{
#pragma unused(grp)
    return hw_lock_try(&lock->hwlock, grp);
}

/*
 * Routine: lck_spin_try_lock_nopreempt
 */
int
lck_spin_try_lock_nopreempt(lck_spin_t *lock)
{
    return hw_lock_try_nopreempt(&lock->hwlock, LCK_GRP_NULL);
}

int
lck_spin_try_lock_nopreempt_grp(lck_spin_t *lock, lck_grp_t *grp)
{
#pragma unused(grp)
    return hw_lock_try_nopreempt(&lock->hwlock, grp);
}

/*
 * Routine: lck_spin_unlock
 */
void
lck_spin_unlock(lck_spin_t *lock)
{
#if DEVELOPMENT || DEBUG
    if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
        panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
    }
    if (lock->type != LCK_SPIN_TYPE) {
        panic("Invalid spinlock type %p", lock);
    }
#endif // DEVELOPMENT || DEBUG
    hw_lock_unlock(&lock->hwlock);
}

/*
 * Routine: lck_spin_unlock_nopreempt
 */
void
lck_spin_unlock_nopreempt(lck_spin_t *lock)
{
#if DEVELOPMENT || DEBUG
    if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC()) {
        panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
    }
    if (lock->type != LCK_SPIN_TYPE) {
        panic("Invalid spinlock type %p", lock);
    }
#endif // DEVELOPMENT || DEBUG
    hw_lock_unlock_nopreempt(&lock->hwlock);
}

/*
 * Routine: lck_spin_destroy
 */
void
lck_spin_destroy(
    lck_spin_t *lck,
    lck_grp_t *grp)
{
    if (lck->lck_spin_data == LCK_SPIN_TAG_DESTROYED) {
        return;
    }
    lck->lck_spin_data = LCK_SPIN_TAG_DESTROYED;
    if (grp) {
        lck_grp_deallocate(grp, &grp->lck_grp_spincnt);
    }
}

/*
 * Routine: kdp_lck_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 */
boolean_t
kdp_lck_spin_is_acquired(lck_spin_t *lck)
{
    if (not_in_kdp) {
        panic("spinlock acquired check done outside of kernel debugger");
    }
    return ((lck->lck_spin_data & ~LCK_SPIN_TAG_DESTROYED) != 0) ? TRUE : FALSE;
}

/*
 * Initialize a usimple_lock.
 *
 * No change in preemption state.
 */
void
usimple_lock_init(
    usimple_lock_t l,
    unsigned short tag)
{
    simple_lock_init((simple_lock_t) l, tag);
}


/*
 * Acquire a usimple_lock.
 *
 * Returns with preemption disabled. Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
void
(usimple_lock)(
    usimple_lock_t l
    LCK_GRP_ARG(lck_grp_t *grp))
{
    simple_lock((simple_lock_t) l, LCK_GRP_PROBEARG(grp));
}


extern void sync(void);

/*
 * Release a usimple_lock.
 *
 * Returns with preemption enabled. Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
void
(usimple_unlock)(
    usimple_lock_t l)
{
    simple_unlock((simple_lock_t)l);
}


/*
 * Conditionally acquire a usimple_lock.
 *
 * On success, returns with preemption disabled.
 * On failure, returns with preemption in the same state
 * as when first invoked. Note that the hw_lock routines
 * are responsible for maintaining preemption state.
 *
 * XXX No stats are gathered on a miss; I preserved this
 * behavior from the original assembly-language code, but
 * doesn't it make sense to log misses? XXX
 */
unsigned int
(usimple_lock_try)(
    usimple_lock_t l
    LCK_GRP_ARG(lck_grp_t *grp))
{
    return simple_lock_try((simple_lock_t) l, grp);
}
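
/*
 * Conditional-acquire pattern (sketch; `l` is any initialized
 * usimple_lock, `grp` its lock group where lock groups are compiled in):
 *
 *      if (usimple_lock_try(l LCK_GRP_ARG(grp))) {
 *          ... critical section, preemption disabled ...
 *          usimple_unlock(l);
 *      } else {
 *          ... back off without spinning ...
 *      }
 */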

/*
 * The C portion of the mutex package. These routines are only invoked
 * if the optimized assembler routines can't do the work.
 */

/*
 * Routine: lck_mtx_alloc_init
 */
lck_mtx_t *
lck_mtx_alloc_init(
    lck_grp_t *grp,
    lck_attr_t *attr)
{
    lck_mtx_t *lck;

    lck = zalloc(KT_LCK_MTX);
    lck_mtx_init(lck, grp, attr);
    return lck;
}

/*
 * Routine: lck_mtx_free
 */
void
lck_mtx_free(
    lck_mtx_t *lck,
    lck_grp_t *grp)
{
    lck_mtx_destroy(lck, grp);
    zfree(KT_LCK_MTX, lck);
}

/*
 * Routine: lck_mtx_init
 */
void
lck_mtx_init(
    lck_mtx_t *lck,
    lck_grp_t *grp,
    lck_attr_t *attr)
{
    lck_mtx_ext_t *lck_ext = NULL;

    if (attr == LCK_ATTR_NULL) {
        attr = &LockDefaultLckAttr;
    }
#ifdef BER_XXX
    if (attr->lck_attr_val & LCK_ATTR_DEBUG) {
        lck_ext = zalloc(KT_LCK_MTX_EXT);
    }
#endif
    lck_mtx_init_ext(lck, lck_ext, grp, attr);
}

/*
 * Routine: lck_mtx_init_ext
 */
void
lck_mtx_init_ext(
    lck_mtx_t *lck,
    lck_mtx_ext_t *lck_ext __unused,
    lck_grp_t *grp,
    lck_attr_t *attr)
{
    if (attr == LCK_ATTR_NULL) {
        attr = &LockDefaultLckAttr;
    }

    *lck = (lck_mtx_t){
        .lck_mtx_type = LCK_MTX_TYPE,
    };

#if LOCKS_INDIRECT_ALLOW
    if (__improbable(lck_ext && (attr->lck_attr_val & LCK_ATTR_DEBUG))) {
        *lck_ext = (lck_mtx_ext_t){
            .lck_mtx_deb.type = MUTEX_TAG,
            .lck_mtx_grp = grp,
            .lck_mtx = *lck,
        };
        lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
        lck->lck_mtx_ptr = lck_ext;
    }
#endif /* LOCKS_INDIRECT_ALLOW */

    lck_grp_reference(grp, &grp->lck_grp_mtxcnt);
}
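
/*
 * Typical mutex lifecycle (sketch): allocate against a group, use,
 * then free. `my_grp` is assumed to be a valid lck_grp_t created
 * elsewhere.
 *
 *      lck_mtx_t *m = lck_mtx_alloc_init(my_grp, LCK_ATTR_NULL);
 *      lck_mtx_lock(m);
 *      ... critical section, may block ...
 *      lck_mtx_unlock(m);
 *      lck_mtx_free(m, my_grp);    // lck_mtx_destroy() + zfree()
 */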

/* The slow versions */
static void lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);
static boolean_t lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread);
static void lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);

/* The adaptive spin function */
static spinwait_result_t lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked);

/*
 * Routine: lck_mtx_verify
 *
 * Verify that a mutex is valid
 */
static inline void
lck_mtx_verify(lck_mtx_t *lock)
{
    if (lock->lck_mtx_type != LCK_MTX_TYPE) {
        panic("Invalid mutex %p", lock);
    }
#if DEVELOPMENT || DEBUG
    if (lock->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
        panic("Mutex destroyed %p", lock);
    }
#endif /* DEVELOPMENT || DEBUG */
}

/*
 * Routine: lck_mtx_check_preemption
 *
 * Verify preemption is enabled when attempting to acquire a mutex.
 */

static inline void
lck_mtx_check_preemption(lck_mtx_t *lock)
{
#if DEVELOPMENT || DEBUG
    if (current_cpu_datap()->cpu_hibernate) {
        return;
    }

    int pl = get_preemption_level();

    if (pl != 0) {
        panic("Attempt to take mutex with preemption disabled. Lock=%p, level=%d", lock, pl);
    }
#else
    (void)lock;
#endif
}

/*
 * Routine: lck_mtx_lock
 */
void
lck_mtx_lock(lck_mtx_t *lock)
{
    thread_t thread;

    lck_mtx_verify(lock);
    lck_mtx_check_preemption(lock);
    thread = current_thread();
    if (os_atomic_cmpxchg(&lock->lck_mtx_data,
        0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
#if CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
#endif /* CONFIG_DTRACE */
        return;
    }
    lck_mtx_lock_contended(lock, thread, FALSE);
}

/*
 * This is the slow version of mutex locking.
 */
static void NOINLINE
lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
{
    thread_t holding_thread;
    uintptr_t state;
    int waiters = 0;
    spinwait_result_t sw_res;
    struct turnstile *ts = NULL;

    /* Loop waiting until I see that the mutex is unowned */
    for (;;) {
        sw_res = lck_mtx_lock_contended_spinwait_arm(lock, thread, interlocked);
        interlocked = FALSE;

        switch (sw_res) {
        case SPINWAIT_ACQUIRED:
            if (ts != NULL) {
                interlock_lock(lock);
                turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
                interlock_unlock(lock);
            }
            goto done;
        case SPINWAIT_INTERLOCK:
            goto set_owner;
        default:
            break;
        }

        state = ordered_load_mtx(lock);
        holding_thread = LCK_MTX_STATE_TO_THREAD(state);
        if (holding_thread == NULL) {
            break;
        }
        ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait
        lck_mtx_lock_wait(lock, holding_thread, &ts);
        /* returns interlock unlocked */
    }

set_owner:
    /* Hooray, I'm the new owner! */
    state = ordered_load_mtx(lock);

    if (state & ARM_LCK_WAITERS) {
        /* Skip lck_mtx_lock_acquire if there are no waiters. */
        waiters = lck_mtx_lock_acquire(lock, ts);
        /*
         * lck_mtx_lock_acquire will call
         * turnstile_complete
         */
    } else {
        if (ts != NULL) {
            turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
        }
    }

    state = LCK_MTX_THREAD_TO_STATE(thread);
    if (waiters != 0) {
        state |= ARM_LCK_WAITERS;
    }
    state |= LCK_ILOCK;                 // Preserve interlock
    ordered_store_mtx(lock, state);     // Set ownership
    interlock_unlock(lock);             // Release interlock, enable preemption

done:
    load_memory_barrier();

    assert(thread->turnstile != NULL);

    if (ts != NULL) {
        turnstile_cleanup();
    }

#if CONFIG_DTRACE
    LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, lock, 0);
#endif /* CONFIG_DTRACE */
}

/*
 * Routine: lck_mtx_lock_contended_spinwait_arm
 *
 * Invoked trying to acquire a mutex when there is contention but
 * the holder is running on another processor. We spin for up to a maximum
 * time waiting for the lock to be released.
 */
static spinwait_result_t
lck_mtx_lock_contended_spinwait_arm(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
{
    int has_interlock = (int)interlocked;
    __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
    thread_t owner, prev_owner;
    uint64_t window_deadline, sliding_deadline, high_deadline;
    uint64_t start_time, cur_time, avg_hold_time, bias, delta;
    int loopcount = 0;
    uint i, prev_owner_cpu;
    int total_hold_time_samples, window_hold_time_samples, unfairness;
    bool owner_on_core, adjust;
    uintptr_t state, new_state, waiters;
    spinwait_result_t retval = SPINWAIT_DID_SPIN_HIGH_THR;

    if (__improbable(!(lck_mtx_adaptive_spin_mode & ADAPTIVE_SPIN_ENABLE))) {
        if (!has_interlock) {
            interlock_lock(lock);
        }

        return SPINWAIT_DID_NOT_SPIN;
    }

    /* Snoop the lock state so the tracepoint reports a valid owner */
    state = ordered_load_mtx(lock);

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
        trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, 0, 0);

    start_time = mach_absolute_time();
    /*
     * window_deadline represents the "learning" phase.
     * The thread collects statistics about the lock during
     * window_deadline and then it makes a decision on whether to spin more
     * or block according to the concurrency behavior
     * observed.
     *
     * Every thread can spin at least low_MutexSpin.
     */
    window_deadline = start_time + low_MutexSpin;
    /*
     * sliding_deadline is the adjusted spin deadline
     * computed after the "learning" phase.
     */
    sliding_deadline = window_deadline;
    /*
     * high_deadline is a hard deadline. No thread
     * can spin more than this deadline.
     */
    if (high_MutexSpin >= 0) {
        high_deadline = start_time + high_MutexSpin;
    } else {
        high_deadline = start_time + low_MutexSpin * real_ncpus;
    }
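
    /*
     * Worked example (illustrative numbers only): with low_MutexSpin
     * worth 500us of timebase ticks, high_MutexSpin < 0 and
     * real_ncpus = 4:
     *
     *      window_deadline  = start_time + 500us   (learning window)
     *      sliding_deadline = window_deadline      (until adjusted below)
     *      high_deadline    = start_time + 2ms     (4 * 500us, hard cap)
     */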

    /*
     * We do not know yet which cpu the owner is running on.
     * Initialize prev_owner_cpu with the next cpu.
     */
    prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
    total_hold_time_samples = 0;
    window_hold_time_samples = 0;
    avg_hold_time = 0;
    adjust = TRUE;
    bias = (os_hash_kernel_pointer(lock) + cpu_number()) % real_ncpus;

    /* Snoop the lock state */
    state = ordered_load_mtx(lock);
    owner = LCK_MTX_STATE_TO_THREAD(state);
    prev_owner = owner;

    if (has_interlock) {
        if (owner == NULL) {
            retval = SPINWAIT_INTERLOCK;
            goto done_spinning;
        } else {
            /*
             * We are holding the interlock, so
             * we can safely dereference owner.
             */
            if (!machine_thread_on_core(owner) || (owner->state & TH_IDLE)) {
                retval = SPINWAIT_DID_NOT_SPIN;
                goto done_spinning;
            }
        }
        interlock_unlock(lock);
        has_interlock = 0;
    }

    /*
     * Spin while:
     *   - mutex is locked, and
     *   - it's locked as a spin lock, and
     *   - owner is running on another processor, and
     *   - we haven't spun for long enough.
     */
    do {
        /*
         * Try to acquire the lock.
         */
        owner = LCK_MTX_STATE_TO_THREAD(state);
        if (owner == NULL) {
            waiters = state & ARM_LCK_WAITERS;
            if (waiters) {
                /*
                 * Preserve the waiters bit
                 * and try to acquire the interlock.
                 * Note: we will successfully acquire
                 * the interlock only if we can also
                 * acquire the lock.
                 */
                new_state = ARM_LCK_WAITERS | LCK_ILOCK;
                has_interlock = 1;
                retval = SPINWAIT_INTERLOCK;
                disable_preemption();
            } else {
                new_state = LCK_MTX_THREAD_TO_STATE(thread);
                retval = SPINWAIT_ACQUIRED;
            }

            /*
             * The cmpxchg will succeed only if the lock
             * is not owned (doesn't have an owner set)
             * and it is not interlocked.
             * It will not fail if there are waiters.
             */
            if (os_atomic_cmpxchgv(&lock->lck_mtx_data,
                waiters, new_state, &state, acquire)) {
                goto done_spinning;
            } else {
                if (waiters) {
                    has_interlock = 0;
                    enable_preemption();
                }
            }
        }

        cur_time = mach_absolute_time();

        /*
         * Never spin past high_deadline.
         */
        if (cur_time >= high_deadline) {
            retval = SPINWAIT_DID_SPIN_HIGH_THR;
            break;
        }

        /*
         * Check if owner is on core. If not, block.
         */
        owner = LCK_MTX_STATE_TO_THREAD(state);
        if (owner) {
            i = prev_owner_cpu;
            owner_on_core = FALSE;

            disable_preemption();
            state = ordered_load_mtx(lock);
            owner = LCK_MTX_STATE_TO_THREAD(state);

            /*
             * For scalability we want to check if the owner is on core
             * without locking the mutex interlock.
             * If we do not lock the mutex interlock, the owner that we see might be
             * invalid, so we cannot dereference it. Therefore we cannot check
             * any field of the thread to tell us if it is on core.
             * Instead, check whether the thread running on any other cpu matches the owner.
             */
            if (owner) {
                do {
                    cpu_data_t *cpu_data_ptr = CpuDataEntries[i].cpu_data_vaddr;
                    if ((cpu_data_ptr != NULL) && (cpu_data_ptr->cpu_active_thread == owner)) {
                        owner_on_core = TRUE;
                        break;
                    }
                    if (++i >= real_ncpus) {
                        i = 0;
                    }
                } while (i != prev_owner_cpu);
                enable_preemption();

                if (owner_on_core) {
                    prev_owner_cpu = i;
                } else {
                    prev_owner = owner;
                    state = ordered_load_mtx(lock);
                    owner = LCK_MTX_STATE_TO_THREAD(state);
                    if (owner == prev_owner) {
                        /*
                         * Owner is not on core.
                         * Stop spinning.
                         */
                        if (loopcount == 0) {
                            retval = SPINWAIT_DID_NOT_SPIN;
                        } else {
                            retval = SPINWAIT_DID_SPIN_OWNER_NOT_CORE;
                        }
                        break;
                    }
                    /*
                     * Fall through if the owner changed while we were scanning.
                     * The new owner could potentially be on core, so loop
                     * again.
                     */
                }
            } else {
                enable_preemption();
            }
        }

        /*
         * Save how many times we see the owner changing.
         * We can roughly estimate the mutex hold
         * time and the fairness with that.
         */
        if (owner != prev_owner) {
            prev_owner = owner;
            total_hold_time_samples++;
            window_hold_time_samples++;
        }

        /*
         * Learning window expired.
         * Try to adjust the sliding_deadline.
         */
        if (cur_time >= window_deadline) {
            /*
             * If there was no contention during the window,
             * stop spinning.
             */
            if (window_hold_time_samples < 1) {
                retval = SPINWAIT_DID_SPIN_NO_WINDOW_CONTENTION;
                break;
            }

            if (adjust) {
                /*
                 * For a fair lock, we'd wait for at most (NCPU-1) periods,
                 * but the lock is unfair, so let's try to estimate by how much.
                 */
                unfairness = total_hold_time_samples / real_ncpus;

                if (unfairness == 0) {
                    /*
                     * We observed the owner changing `total_hold_time_samples` times, which
                     * lets us estimate the average hold time of this mutex for the duration
                     * of the spin time.
                     * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
                     *
                     * In this case spin at most avg_hold_time * (real_ncpus - 1)
                     */
                    delta = cur_time - start_time;
                    sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
                } else {
                    /*
                     * In this case at least one of the other cpus was able to get the lock twice
                     * while I was spinning.
                     * We could spin longer but it won't necessarily help if the system is unfair.
                     * Try to randomize the wait to reduce contention.
                     *
                     * We compute how much time we could potentially spin
                     * and distribute it over the cpus.
                     *
                     * bias is an integer between 0 and real_ncpus.
                     * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
                     */
                    delta = high_deadline - cur_time;
                    sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
                    adjust = FALSE;
                }
            }

            window_deadline += low_MutexSpin;
            window_hold_time_samples = 0;
        }
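
        /*
         * Numeric sketch of the two cases above (illustrative numbers):
         * with real_ncpus = 4, if the owner changed 2 times during
         * 100us of spinning, unfairness = 2 / 4 = 0 and
         *      sliding_deadline = start_time + (100us * 3) / 2,
         * i.e. spin for roughly (NCPU-1) average hold times. Had the
         * owner changed 8 times, unfairness = 2 and the remaining
         * budget up to high_deadline would instead be staggered by the
         * per-cpu `bias`.
         */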

        /*
         * Stop spinning if we are past
         * the adjusted deadline.
         */
        if (cur_time >= sliding_deadline) {
            retval = SPINWAIT_DID_SPIN_SLIDING_THR;
            break;
        }

        /*
         * We want to arm the monitor for wfe,
         * so load the lock exclusively.
         *
         * NOTE:
         * we rely on the fact that wfe will
         * eventually return even if the cache line
         * is not modified. This way we will keep
         * looping and checking if the deadlines expired.
         */
        state = os_atomic_load_exclusive(&lock->lck_mtx_data, relaxed);
        owner = LCK_MTX_STATE_TO_THREAD(state);
        if (owner != NULL) {
            wait_for_event();
            state = ordered_load_mtx(lock);
        } else {
            atomic_exchange_abort();
        }

        loopcount++;
    } while (TRUE);

done_spinning:
#if CONFIG_DTRACE
    /*
     * Note that we record a different probe id depending on whether
     * this is a direct or indirect mutex. This allows us to
     * penalize only lock groups that have debug/stats enabled
     * with dtrace processing if desired.
     */
#if LOCKS_INDIRECT_ALLOW
    if (__probable(lock->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)) {
        LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ADAPTIVE_SPIN, lock,
            mach_absolute_time() - start_time);
    } else {
        LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ADAPTIVE_SPIN, lock,
            mach_absolute_time() - start_time);
    }
#else
    LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ADAPTIVE_SPIN, lock,
        mach_absolute_time() - start_time);
#endif /* LOCKS_INDIRECT_ALLOW */
    /* The lockstat acquire event is recorded by the caller. */
#endif

    state = ordered_load_mtx(lock);

    KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
        trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(LCK_MTX_STATE_TO_THREAD(state)), lock->lck_mtx_waiters, retval, 0);
    if ((!has_interlock) && (retval != SPINWAIT_ACQUIRED)) {
        /* We must own either the lock or the interlock on return. */
        interlock_lock(lock);
    }

    return retval;
}

/*
 * Common code for mutex locking as spinlock
 */
static inline void
lck_mtx_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
{
    uintptr_t state;
#if CONFIG_DTRACE
    bool stat_enabled = false;
    uint64_t start_time = 0;

    if (interlock_try(lock)) {
        goto interlock_locked;
    }

#if LOCKS_INDIRECT_ALLOW
    bool indirect = (lock->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);

    if ((lockstat_probemap[LS_LCK_MTX_LOCK_SPIN_SPIN] && !indirect) ||
        (lockstat_probemap[LS_LCK_MTX_EXT_LOCK_SPIN_SPIN] && indirect))
#else
    if (lockstat_probemap[LS_LCK_MTX_LOCK_SPIN_SPIN])
#endif /* LOCKS_INDIRECT_ALLOW */
    {
        stat_enabled = true;
        start_time = mach_absolute_time();
    }
#endif /* CONFIG_DTRACE */

    interlock_lock(lock);

#if CONFIG_DTRACE
    if (stat_enabled) {
#if LOCKS_INDIRECT_ALLOW
        if (indirect) {
            LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN_SPIN, lock,
                mach_absolute_time() - start_time);
        } else
#endif /* LOCKS_INDIRECT_ALLOW */
        {
            LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_SPIN, lock,
                mach_absolute_time() - start_time);
        }
    }

interlock_locked:
#endif /* CONFIG_DTRACE */

    state = ordered_load_mtx(lock);
    if (LCK_MTX_STATE_TO_THREAD(state)) {
        if (allow_held_as_mutex) {
            lck_mtx_lock_contended(lock, current_thread(), TRUE);
        } else {
            // "Always" variants can never block. If the lock is held and blocking is not allowed
            // then someone is mixing always and non-always calls on the same lock, which is
            // forbidden.
            panic("Attempting to block on a lock taken as spin-always %p", lock);
        }
        return;
    }
    state &= ARM_LCK_WAITERS;                   // Preserve waiters bit
    state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK);    // Add spin tag and maintain interlock
    ordered_store_mtx(lock, state);
    load_memory_barrier();

#if CONFIG_DTRACE
    LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
#endif /* CONFIG_DTRACE */
}

/*
 * Routine: lck_mtx_lock_spin
 */
void
lck_mtx_lock_spin(lck_mtx_t *lock)
{
    lck_mtx_check_preemption(lock);
    lck_mtx_lock_spin_internal(lock, TRUE);
}

/*
 * Routine: lck_mtx_lock_spin_always
 */
void
lck_mtx_lock_spin_always(lck_mtx_t *lock)
{
    lck_mtx_lock_spin_internal(lock, FALSE);
}

/*
 * Routine: lck_mtx_try_lock
 */
boolean_t
lck_mtx_try_lock(lck_mtx_t *lock)
{
    thread_t thread = current_thread();

    lck_mtx_verify(lock);
    if (os_atomic_cmpxchg(&lock->lck_mtx_data,
        0, LCK_MTX_THREAD_TO_STATE(thread), acquire)) {
#if CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0);
#endif /* CONFIG_DTRACE */
        return TRUE;
    }
    return lck_mtx_try_lock_contended(lock, thread);
}

static boolean_t NOINLINE
lck_mtx_try_lock_contended(lck_mtx_t *lock, thread_t thread)
{
    thread_t holding_thread;
    uintptr_t state;
    int waiters;

    if (!interlock_try(lock)) {
        return FALSE;
    }
    state = ordered_load_mtx(lock);
    holding_thread = LCK_MTX_STATE_TO_THREAD(state);
    if (holding_thread) {
        interlock_unlock(lock);
        return FALSE;
    }
    waiters = lck_mtx_lock_acquire(lock, NULL);
    state = LCK_MTX_THREAD_TO_STATE(thread);
    if (waiters != 0) {
        state |= ARM_LCK_WAITERS;
    }
    state |= LCK_ILOCK;                 // Preserve interlock
    ordered_store_mtx(lock, state);     // Set ownership
    interlock_unlock(lock);             // Release interlock, enable preemption
    load_memory_barrier();

    turnstile_cleanup();

#if CONFIG_DTRACE
    LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, lock, 0);
#endif /* CONFIG_DTRACE */

    return TRUE;
}

static inline boolean_t
lck_mtx_try_lock_spin_internal(lck_mtx_t *lock, boolean_t allow_held_as_mutex)
{
    uintptr_t state;

    if (!interlock_try(lock)) {
        return FALSE;
    }
    state = ordered_load_mtx(lock);
    if (LCK_MTX_STATE_TO_THREAD(state)) {
        // Lock is held as mutex
        if (allow_held_as_mutex) {
            interlock_unlock(lock);
        } else {
            // "Always" variants can never block. If the lock is held as a normal mutex
            // then someone is mixing always and non-always calls on the same lock, which is
            // forbidden.
            panic("Spin-mutex held as full mutex %p", lock);
        }
        return FALSE;
    }
    state &= ARM_LCK_WAITERS;                   // Preserve waiters bit
    state |= (LCK_MTX_SPIN_TAG | LCK_ILOCK);    // Add spin tag and maintain interlock
    ordered_store_mtx(lock, state);
    load_memory_barrier();

#if CONFIG_DTRACE
    LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_SPIN_ACQUIRE, lock, 0);
#endif /* CONFIG_DTRACE */
    return TRUE;
}

/*
 * Routine: lck_mtx_try_lock_spin
 */
boolean_t
lck_mtx_try_lock_spin(lck_mtx_t *lock)
{
    return lck_mtx_try_lock_spin_internal(lock, TRUE);
}

/*
 * Routine: lck_mtx_try_lock_spin_always
 */
boolean_t
lck_mtx_try_lock_spin_always(lck_mtx_t *lock)
{
    return lck_mtx_try_lock_spin_internal(lock, FALSE);
}

/*
 * Routine: lck_mtx_unlock
 */
void
lck_mtx_unlock(lck_mtx_t *lock)
{
    thread_t thread = current_thread();
    uintptr_t state;
    boolean_t ilk_held = FALSE;

    lck_mtx_verify(lock);

    state = ordered_load_mtx(lock);
    if (state & LCK_ILOCK) {
        if (LCK_MTX_STATE_TO_THREAD(state) == (thread_t)LCK_MTX_SPIN_TAG) {
            ilk_held = TRUE;    // Interlock is held by (presumably) this thread
        }
        goto slow_case;
    }
    // Locked as a mutex
    if (os_atomic_cmpxchg(&lock->lck_mtx_data,
        LCK_MTX_THREAD_TO_STATE(thread), 0, release)) {
#if CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
#endif /* CONFIG_DTRACE */
        return;
    }
slow_case:
    lck_mtx_unlock_contended(lock, thread, ilk_held);
}

static void NOINLINE
lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
{
    uintptr_t state;
    boolean_t cleanup = FALSE;

    if (ilk_held) {
        state = ordered_load_mtx(lock);
    } else {
        interlock_lock(lock);
        state = ordered_load_mtx(lock);
        if (thread != LCK_MTX_STATE_TO_THREAD(state)) {
            panic("lck_mtx_unlock(): Attempt to release lock not owned by thread (%p)", lock);
        }
        if (state & ARM_LCK_WAITERS) {
            if (lck_mtx_unlock_wakeup(lock, thread)) {
                state = ARM_LCK_WAITERS;
            } else {
                state = 0;
            }
            cleanup = TRUE;
            goto unlock;
        }
    }
    state &= ARM_LCK_WAITERS;   /* Clear state, retain waiters bit */
unlock:
    state |= LCK_ILOCK;
    ordered_store_mtx(lock, state);
    interlock_unlock(lock);
    if (cleanup) {
        /*
         * Do not do any turnstile operations outside of this block.
         * lock/unlock is called at an early stage of boot with a single thread,
         * when the turnstile is not yet initialized.
         * Even without contention we can come through the slow path
         * if the mutex is acquired as a spin lock.
         */
        turnstile_cleanup();
    }

#if CONFIG_DTRACE
    LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lock, 0);
#endif /* CONFIG_DTRACE */
}

/*
 * Routine: lck_mtx_assert
 */
void
lck_mtx_assert(lck_mtx_t *lock, unsigned int type)
{
    thread_t thread, holder;
    uintptr_t state;

    state = ordered_load_mtx(lock);
    holder = LCK_MTX_STATE_TO_THREAD(state);
    if (holder == (thread_t)LCK_MTX_SPIN_TAG) {
        // Lock is held in spin mode, owner is unknown.
        return;                 // Punt
    }
    thread = current_thread();
    if (type == LCK_MTX_ASSERT_OWNED) {
        if (thread != holder) {
            panic("lck_mtx_assert(): mutex (%p) not owned by current thread", lock);
        }
    } else if (type == LCK_MTX_ASSERT_NOTOWNED) {
        if (thread == holder) {
            panic("lck_mtx_assert(): mutex (%p) owned by current thread", lock);
        }
    } else {
        panic("lck_mtx_assert(): invalid arg (%u)", type);
    }
}

/*
 * Routine: lck_mtx_ilk_unlock
 */
boolean_t
lck_mtx_ilk_unlock(lck_mtx_t *lock)
{
    interlock_unlock(lock);
    return TRUE;
}

/*
 * Routine: lck_mtx_convert_spin
 *
 * Convert a mutex held for spin into a held full mutex
 */
void
lck_mtx_convert_spin(lck_mtx_t *lock)
{
    thread_t thread = current_thread();
    uintptr_t state;
    int waiters;

    state = ordered_load_mtx(lock);
    if (LCK_MTX_STATE_TO_THREAD(state) == thread) {
        return;                 // Already owned as mutex, return
    }
    if ((state & LCK_ILOCK) == 0 || (LCK_MTX_STATE_TO_THREAD(state) != (thread_t)LCK_MTX_SPIN_TAG)) {
        panic("lck_mtx_convert_spin: Not held as spinlock (%p)", lock);
    }
    state &= ~(LCK_MTX_THREAD_MASK);    // Clear the spin tag
    ordered_store_mtx(lock, state);
    waiters = lck_mtx_lock_acquire(lock, NULL); // Acquire to manage priority boosts
    state = LCK_MTX_THREAD_TO_STATE(thread);
    if (waiters != 0) {
        state |= ARM_LCK_WAITERS;
    }
    state |= LCK_ILOCK;
    ordered_store_mtx(lock, state);     // Set ownership
    interlock_unlock(lock);             // Release interlock, enable preemption
    turnstile_cleanup();
}
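
/*
 * Usage sketch: take the lock in spin mode for a short lookup and
 * convert it to a full mutex before anything that may block.
 * `must_block` is a hypothetical condition.
 *
 *      lck_mtx_lock_spin(m);
 *      if (must_block) {
 *          lck_mtx_convert_spin(m);    // now held as a full mutex
 *          ... may block ...
 *      }
 *      lck_mtx_unlock(m);              // handles either mode
 */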

/*
 * Routine: lck_mtx_destroy
 */
void
lck_mtx_destroy(
    lck_mtx_t *lck,
    lck_grp_t *grp)
{
    if (lck->lck_mtx_type != LCK_MTX_TYPE) {
        panic("Destroying invalid mutex %p", lck);
    }
    if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
        panic("Destroying previously destroyed lock %p", lck);
    }
    lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
    lck->lck_mtx_tag = LCK_MTX_TAG_DESTROYED;
    lck_grp_deallocate(grp, &grp->lck_grp_mtxcnt);
}

/*
 * Routine: lck_spin_assert
 */
void
lck_spin_assert(lck_spin_t *lock, unsigned int type)
{
    thread_t thread, holder;
    uintptr_t state;

    if (lock->type != LCK_SPIN_TYPE) {
        panic("Invalid spinlock %p", lock);
    }

    state = lock->lck_spin_data;
    holder = (thread_t)(state & ~LCK_ILOCK);
    thread = current_thread();
    if (type == LCK_ASSERT_OWNED) {
        if (holder == 0) {
            panic("Lock not owned %p = %lx", lock, state);
        }
        if (holder != thread) {
            panic("Lock not owned by current thread %p = %lx", lock, state);
        }
        if ((state & LCK_ILOCK) == 0) {
            panic("Lock bit not set %p = %lx", lock, state);
        }
    } else if (type == LCK_ASSERT_NOTOWNED) {
        if (holder != 0) {
            if (holder == thread) {
                panic("Lock owned by current thread %p = %lx", lock, state);
            }
        }
    } else {
        panic("lck_spin_assert(): invalid arg (%u)", type);
    }
}

/*
 * Routine: kdp_lck_mtx_lock_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 */
boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
{
    uintptr_t state;

    if (not_in_kdp) {
        panic("spinlock acquired check done outside of kernel debugger");
    }
    state = ordered_load_mtx(lck);
    if (state == LCK_MTX_TAG_DESTROYED) {
        return FALSE;
    }
    if (LCK_MTX_STATE_TO_THREAD(state) || (state & LCK_ILOCK)) {
        return TRUE;
    }
    return FALSE;
}

void
kdp_lck_mtx_find_owner(__unused struct waitq *waitq, event64_t event, thread_waitinfo_t *waitinfo)
{
    lck_mtx_t *mutex = LCK_EVENT_TO_MUTEX(event);
    waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
    uintptr_t state = ordered_load_mtx(mutex);
    thread_t holder = LCK_MTX_STATE_TO_THREAD(state);
    if ((uintptr_t)holder == (uintptr_t)LCK_MTX_SPIN_TAG) {
        waitinfo->owner = STACKSHOT_WAITOWNER_MTXSPIN;
    } else {
        assertf(state != (uintptr_t)LCK_MTX_TAG_DESTROYED, "state=0x%llx", (uint64_t)state);
#if LOCKS_INDIRECT_ALLOW
        assertf(state != (uintptr_t)LCK_MTX_TAG_INDIRECT, "state=0x%llx", (uint64_t)state);
#endif /* LOCKS_INDIRECT_ALLOW */
        waitinfo->owner = thread_tid(holder);
    }
}