1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64 #define LOCK_PRIVATE 1
65
66 #include <mach_ldebug.h>
67
68 #include <kern/lock_stat.h>
69 #include <kern/locks.h>
70 #include <kern/zalloc.h>
71 #include <kern/misc_protos.h>
72 #include <kern/thread.h>
73 #include <kern/processor.h>
74 #include <kern/cpu_data.h>
75 #include <kern/cpu_number.h>
76 #include <kern/sched_prim.h>
77 #include <kern/debug.h>
78 #include <string.h>
79
80 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
81 #include <i386/tsc.h>
82 #include <machine/atomic.h>
83 #include <machine/machine_cpu.h>
84 #include <i386/mp.h>
85 #include <machine/atomic.h>
86 #include <sys/kdebug.h>
87 #if LCK_MTX_USE_ARCH
88 #include <i386/locks_i386_inlines.h>
89 #endif /* LCK_MTX_USE_ARCH */
90 #include <kern/cpu_number.h>
91 #include <os/hash.h>
92 #include <i386/cpuid.h>
93
94 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
95
96 uint64_t _Atomic lock_panic_timeout = 0xf000000; /* 251e6 TSC ticks */
97
98 /* Forwards */
99
100 #if USLOCK_DEBUG
101 /*
102 * Perform simple lock checks.
103 */
104 int uslock_check = 1;
105 int max_lock_loops = 100000000;
106 decl_simple_lock_data(extern, printf_lock);
107 decl_simple_lock_data(extern, panic_lock);
108 #endif /* USLOCK_DEBUG */
109
110 extern unsigned int not_in_kdp;
111
112 #if !LCK_GRP_USE_ARG
113 #define usimple_lock_nopreempt(lck, grp) \
114 usimple_lock_nopreempt(lck)
115 #define usimple_lock_try_nopreempt(lck, grp) \
116 usimple_lock_try_nopreempt(lck)
117 #endif
118 static void usimple_lock_nopreempt(usimple_lock_t, lck_grp_t *);
119 static unsigned int usimple_lock_try_nopreempt(usimple_lock_t, lck_grp_t *);
120
121 /*
122 * We often want to know the addresses of the callers
123 * of the various lock routines. However, this information
124 * is only used for debugging and statistics.
125 */
126 typedef void *pc_t;
127 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
128 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
129 #if ANY_LOCK_DEBUG
130 #define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
131 #define DECL_PC(pc) pc_t pc;
132 #else /* ANY_LOCK_DEBUG */
133 #define DECL_PC(pc)
134 #ifdef lint
135 /*
136 * Eliminate lint complaints about unused local pc variables.
137 */
138 #define OBTAIN_PC(pc) ++pc
139 #else /* lint */
140 #define OBTAIN_PC(pc)
141 #endif /* lint */
#endif /* ANY_LOCK_DEBUG */
143
144 KALLOC_TYPE_DEFINE(KT_LCK_SPIN, lck_spin_t, KT_PRIV_ACCT);
145
146 #if LCK_MTX_USE_ARCH
147 KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);
148 #endif /* LCK_MTX_USE_ARCH */
149
150 /*
151 * atomic exchange API is a low level abstraction of the operations
152 * to atomically read, modify, and write a pointer. This abstraction works
153 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
154 * well as the ARM exclusive instructions.
155 *
156 * atomic_exchange_begin() - begin exchange and retrieve current value
157 * atomic_exchange_complete() - conclude an exchange
158 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
159 */
160 uint32_t
atomic_exchange_begin32(uint32_t * target,uint32_t * previous,enum memory_order ord)161 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
162 {
163 uint32_t val;
164
165 (void)ord; // Memory order not used
166 val = os_atomic_load(target, relaxed);
167 *previous = val;
168 return val;
169 }
170
171 boolean_t
atomic_exchange_complete32(uint32_t * target,uint32_t previous,uint32_t newval,enum memory_order ord)172 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
173 {
174 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
175 }
176
/*
 * Cancel an exchange started with atomic_exchange_begin32().
 * begin performs only a relaxed load on this architecture, so there is
 * no reservation to roll back; this is intentionally a no-op.
 */
void
atomic_exchange_abort(void)
{
}
181
/*
 * Atomically OR set_mask into *target, but only when none of the bits in
 * test_mask are currently set.
 *
 * Returns TRUE when the bits were set.  Returns FALSE when a test_mask bit
 * was found set; in that case, wait == TRUE issues a single cpu_pause()
 * spin hint before returning (caller is expected to retry), while
 * wait == FALSE aborts the in-progress exchange instead.  Note that FALSE
 * is returned in both cases — the loop below only retries on a CAS race,
 * never on a busy lock.
 */
boolean_t
atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
{
	uint32_t value, prev;

	for (;;) {
		value = atomic_exchange_begin32(target, &prev, ord);
		if (value & test_mask) {
			/* busy: spin-hint or cancel, then report failure */
			if (wait) {
				cpu_pause();
			} else {
				atomic_exchange_abort();
			}
			return FALSE;
		}
		value |= set_mask;
		/* retry only if another CPU raced between begin and complete */
		if (atomic_exchange_complete32(target, prev, value, ord)) {
			return TRUE;
		}
	}
}
203
204 /*
205 * Portable lock package implementation of usimple_locks.
206 */
207
208 #if USLOCK_DEBUG
209 #define USLDBG(stmt) stmt
210 void usld_lock_init(usimple_lock_t, unsigned short);
211 void usld_lock_pre(usimple_lock_t, pc_t);
212 void usld_lock_post(usimple_lock_t, pc_t);
213 void usld_unlock(usimple_lock_t, pc_t);
214 void usld_lock_try_pre(usimple_lock_t, pc_t);
215 void usld_lock_try_post(usimple_lock_t, pc_t);
216 int usld_lock_common_checks(usimple_lock_t, char *);
217 #else /* USLOCK_DEBUG */
218 #define USLDBG(stmt)
219 #endif /* USLOCK_DEBUG */
220
221 #if LCK_MTX_USE_ARCH
222 /*
223 * Forward definitions
224 */
225
226 static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state);
227 static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
228 static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
229 static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
230 static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
231 static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
232 #endif /* LCK_MTX_USE_ARCH */
233
234
235 /*
236 * Routine: lck_spin_alloc_init
237 */
238 lck_spin_t *
lck_spin_alloc_init(lck_grp_t * grp,lck_attr_t * attr)239 lck_spin_alloc_init(
240 lck_grp_t *grp,
241 lck_attr_t *attr)
242 {
243 lck_spin_t *lck;
244
245 lck = zalloc(KT_LCK_SPIN);
246 lck_spin_init(lck, grp, attr);
247 return lck;
248 }
249
250 /*
251 * Routine: lck_spin_free
252 */
253 void
lck_spin_free(lck_spin_t * lck,lck_grp_t * grp)254 lck_spin_free(
255 lck_spin_t *lck,
256 lck_grp_t *grp)
257 {
258 lck_spin_destroy(lck, grp);
259 zfree(KT_LCK_SPIN, lck);
260 }
261
262 /*
263 * Routine: lck_spin_init
264 */
265 void
lck_spin_init(lck_spin_t * lck,lck_grp_t * grp,__unused lck_attr_t * attr)266 lck_spin_init(
267 lck_spin_t *lck,
268 lck_grp_t *grp,
269 __unused lck_attr_t *attr)
270 {
271 usimple_lock_init((usimple_lock_t) lck, 0);
272 if (grp) {
273 lck_grp_reference(grp, &grp->lck_grp_spincnt);
274 }
275 }
276
277 /*
278 * Routine: lck_spin_destroy
279 */
280 void
lck_spin_destroy(lck_spin_t * lck,lck_grp_t * grp)281 lck_spin_destroy(
282 lck_spin_t *lck,
283 lck_grp_t *grp)
284 {
285 if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
286 return;
287 }
288 lck->interlock = LCK_SPIN_TAG_DESTROYED;
289 if (grp) {
290 lck_grp_deallocate(grp, &grp->lck_grp_spincnt);
291 }
292 return;
293 }
294
295 /*
296 * Routine: lck_spin_lock
297 */
298 void
lck_spin_lock_grp(lck_spin_t * lck,lck_grp_t * grp)299 lck_spin_lock_grp(
300 lck_spin_t *lck,
301 lck_grp_t *grp)
302 {
303 #pragma unused(grp)
304 usimple_lock((usimple_lock_t) lck, grp);
305 }
306
307 void
lck_spin_lock(lck_spin_t * lck)308 lck_spin_lock(
309 lck_spin_t *lck)
310 {
311 usimple_lock((usimple_lock_t) lck, NULL);
312 }
313
314 void
lck_spin_lock_nopreempt(lck_spin_t * lck)315 lck_spin_lock_nopreempt(
316 lck_spin_t *lck)
317 {
318 usimple_lock_nopreempt((usimple_lock_t) lck, NULL);
319 }
320
321 void
lck_spin_lock_nopreempt_grp(lck_spin_t * lck,lck_grp_t * grp)322 lck_spin_lock_nopreempt_grp(
323 lck_spin_t *lck,
324 lck_grp_t *grp)
325 {
326 #pragma unused(grp)
327 usimple_lock_nopreempt((usimple_lock_t) lck, grp);
328 }
329
330 /*
331 * Routine: lck_spin_unlock
332 */
333 void
lck_spin_unlock(lck_spin_t * lck)334 lck_spin_unlock(
335 lck_spin_t *lck)
336 {
337 usimple_unlock((usimple_lock_t) lck);
338 }
339
340 void
lck_spin_unlock_nopreempt(lck_spin_t * lck)341 lck_spin_unlock_nopreempt(
342 lck_spin_t *lck)
343 {
344 usimple_unlock_nopreempt((usimple_lock_t) lck);
345 }
346
347 boolean_t
lck_spin_try_lock_grp(lck_spin_t * lck,lck_grp_t * grp)348 lck_spin_try_lock_grp(
349 lck_spin_t *lck,
350 lck_grp_t *grp)
351 {
352 #pragma unused(grp)
353 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
354 #if DEVELOPMENT || DEBUG
355 if (lrval) {
356 pltrace(FALSE);
357 }
358 #endif
359 return lrval;
360 }
361
362
363 /*
364 * Routine: lck_spin_try_lock
365 */
366 boolean_t
lck_spin_try_lock(lck_spin_t * lck)367 lck_spin_try_lock(
368 lck_spin_t *lck)
369 {
370 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
371 #if DEVELOPMENT || DEBUG
372 if (lrval) {
373 pltrace(FALSE);
374 }
375 #endif
376 return lrval;
377 }
378
379 int
lck_spin_try_lock_nopreempt(lck_spin_t * lck)380 lck_spin_try_lock_nopreempt(
381 lck_spin_t *lck)
382 {
383 boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, LCK_GRP_NULL);
384 #if DEVELOPMENT || DEBUG
385 if (lrval) {
386 pltrace(FALSE);
387 }
388 #endif
389 return lrval;
390 }
391
392 int
lck_spin_try_lock_nopreempt_grp(lck_spin_t * lck,lck_grp_t * grp)393 lck_spin_try_lock_nopreempt_grp(
394 lck_spin_t *lck,
395 lck_grp_t *grp)
396 {
397 #pragma unused(grp)
398 boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, grp);
399 #if DEVELOPMENT || DEBUG
400 if (lrval) {
401 pltrace(FALSE);
402 }
403 #endif
404 return lrval;
405 }
406
/*
 * Routine:	lck_spin_assert
 *
 * Panic unless the lock's ownership matches `type`:
 *   LCK_ASSERT_OWNED    - must be held by the current thread
 *   LCK_ASSERT_NOTOWNED - must not be held by the current thread
 *
 * The interlock word is read once, unsynchronized, and interpreted as the
 * owning thread pointer (0 when unlocked); assertions tolerate this racy
 * snapshot because a violation by the current thread cannot change
 * concurrently.
 */
void
lck_spin_assert(const lck_spin_t *lock, unsigned int type)
{
	thread_t thread, holder;
	uintptr_t state;

	if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
		panic("lck_spin_assert(): invalid arg (%u)", type);
	}

	/* single racy snapshot of the owner word */
	state = lock->interlock;
	holder = (thread_t)state;
	thread = current_thread();
	if (type == LCK_ASSERT_OWNED) {
		if (__improbable(holder == THREAD_NULL)) {
			panic("Lock not owned %p = %lx", lock, state);
		}
		if (__improbable(holder != thread)) {
			panic("Lock not owned by current thread %p = %lx", lock, state);
		}
	} else if (type == LCK_ASSERT_NOTOWNED) {
		if (__improbable(holder != THREAD_NULL)) {
			/* only a self-owned lock is a definite violation */
			if (holder == thread) {
				panic("Lock owned by current thread %p = %lx", lock, state);
			}
		}
	}
}
438
439 /*
440 * Routine: kdp_lck_spin_is_acquired
441 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
442 * Returns: TRUE if lock is acquired.
443 */
444 boolean_t
kdp_lck_spin_is_acquired(lck_spin_t * lck)445 kdp_lck_spin_is_acquired(lck_spin_t *lck)
446 {
447 if (not_in_kdp) {
448 panic("panic: spinlock acquired check done outside of kernel debugger");
449 }
450 return (lck->interlock != 0)? TRUE : FALSE;
451 }
452
453 /*
454 * Initialize a usimple_lock.
455 *
456 * No change in preemption state.
457 */
458 void
usimple_lock_init(usimple_lock_t l,__unused unsigned short tag)459 usimple_lock_init(
460 usimple_lock_t l,
461 __unused unsigned short tag)
462 {
463 USLDBG(usld_lock_init(l, tag));
464 hw_lock_init(&l->interlock);
465 }
466
/*
 * Timeout handler invoked by the hw_lock spin machinery when acquiring a
 * usimple_lock exceeds its spin policy timeout.  While machine timeouts
 * are suspended (e.g. under the debugger) the caller is told to keep
 * spinning; otherwise diagnostic state is recorded and the routine
 * panics, so it does not return in that case.
 */
static hw_spin_timeout_status_t
usimple_lock_acquire_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
{
	usimple_lock_t l = _lock;
	uintptr_t lowner;
	lck_spinlock_to_info_t lsti;

	if (machine_timeout_suspended()) {
		return HW_LOCK_TIMEOUT_CONTINUE;
	}

	/* snapshot the raw owner word and stash timeout info for the panic log */
	lowner = (uintptr_t)l->interlock.lock_data;
	lsti = lck_spinlock_timeout_hit(l, lowner);

	panic("Spinlock[%p] " HW_SPIN_TIMEOUT_FMT " ;"
	    "lock owner thread=0x%lx, current_thread: %p, "
	    "lock owner active on CPU %d, "
	    HW_SPIN_TIMEOUT_DETAILS_FMT,
	    l, HW_SPIN_TIMEOUT_ARG(to, st),
	    lowner, current_thread(), lsti->owner_cpu,
	    HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
}
489
/*
 * Spin policy for usimple locks: spin for up to LockTimeOutTSC ticks,
 * then let usimple_lock_acquire_timeout_panic() decide whether to keep
 * spinning or panic.
 */
static const struct hw_spin_policy usimple_lock_spin_policy = {
	.hwsp_name = "usimple_lock_t",
	.hwsp_timeout = &LockTimeOutTSC,
	.hwsp_op_timeout = usimple_lock_acquire_timeout_panic,
};
495
/*
 * Acquire a usimple_lock.
 *
 * Returns with preemption disabled.  Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 *
 * The function name is parenthesized to defeat any usimple_lock()
 * function-like macro; the grp parameter only exists when
 * LCK_GRP_USE_ARG is configured (see LCK_GRP_ARG).
 */
void
(usimple_lock)(
	usimple_lock_t l
	LCK_GRP_ARG(lck_grp_t *grp))
{
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_lock_pre(l, pc));

	/* spin with the timeout policy; panics on pathological hold times */
	(void)hw_lock_to(&l->interlock, &usimple_lock_spin_policy, grp);
#if DEVELOPMENT || DEBUG
	pltrace(FALSE);
#endif

	USLDBG(usld_lock_post(l, pc));
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
#endif
}
523
/*
 * Acquire a usimple_lock_nopreempt
 *
 * Called and returns with preemption disabled.  Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
static void
usimple_lock_nopreempt(
	usimple_lock_t l,
	lck_grp_t *grp)
{
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_lock_pre(l, pc));

	/* same spin/timeout policy as usimple_lock, minus preemption changes */
	(void)hw_lock_to_nopreempt(&l->interlock, &usimple_lock_spin_policy, grp);

#if DEVELOPMENT || DEBUG
	pltrace(FALSE);
#endif

	USLDBG(usld_lock_post(l, pc));
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
#endif
}
552
553
/*
 * Release a usimple_lock.
 *
 * Returns with preemption enabled.  Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 *
 * Debug checks (ownership, CPU) run before the hardware release.
 */
void
usimple_unlock(
	usimple_lock_t l)
{
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_unlock(l, pc));
#if DEVELOPMENT || DEBUG
	pltrace(TRUE);
#endif
	hw_lock_unlock(&l->interlock);
}
574
/*
 * Release a usimple_lock without re-enabling preemption.
 *
 * Called and returns with preemption disabled.  Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
void
usimple_unlock_nopreempt(
	usimple_lock_t l)
{
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_unlock(l, pc));
#if DEVELOPMENT || DEBUG
	pltrace(TRUE);
#endif
	hw_lock_unlock_nopreempt(&l->interlock);
}
595
596 /*
597 * Conditionally acquire a usimple_lock.
598 *
599 * On success, returns with preemption disabled.
600 * On failure, returns with preemption in the same state
601 * as when first invoked. Note that the hw_lock routines
602 * are responsible for maintaining preemption state.
603 *
604 * XXX No stats are gathered on a miss; I preserved this
605 * behavior from the original assembly-language code, but
606 * doesn't it make sense to log misses? XXX
607 */
608 unsigned int
usimple_lock_try(usimple_lock_t l,lck_grp_t * grp)609 usimple_lock_try(
610 usimple_lock_t l,
611 lck_grp_t *grp)
612 {
613 unsigned int success;
614 DECL_PC(pc);
615
616 OBTAIN_PC(pc);
617 USLDBG(usld_lock_try_pre(l, pc));
618 if ((success = hw_lock_try(&l->interlock, grp))) {
619 #if DEVELOPMENT || DEBUG
620 pltrace(FALSE);
621 #endif
622 USLDBG(usld_lock_try_post(l, pc));
623 }
624 return success;
625 }
626
627 void
usimple_lock_assert(usimple_lock_t l,unsigned int type)628 usimple_lock_assert(usimple_lock_t l, unsigned int type)
629 {
630 hw_lock_assert(&l->interlock, type);
631 }
632
633 /*
634 * Conditionally acquire a usimple_lock.
635 *
636 * Called and returns with preemption disabled. Note
637 * that the hw_lock routines are responsible for
638 * maintaining preemption state.
639 *
640 * XXX No stats are gathered on a miss; I preserved this
641 * behavior from the original assembly-language code, but
642 * doesn't it make sense to log misses? XXX
643 */
644 static unsigned int
usimple_lock_try_nopreempt(usimple_lock_t l,lck_grp_t * grp)645 usimple_lock_try_nopreempt(
646 usimple_lock_t l,
647 lck_grp_t *grp)
648 {
649 unsigned int success;
650 DECL_PC(pc);
651
652 OBTAIN_PC(pc);
653 USLDBG(usld_lock_try_pre(l, pc));
654 if ((success = hw_lock_try_nopreempt(&l->interlock, grp))) {
655 #if DEVELOPMENT || DEBUG
656 pltrace(FALSE);
657 #endif
658 USLDBG(usld_lock_try_post(l, pc));
659 }
660 return success;
661 }
662
663 /*
664 * Acquire a usimple_lock while polling for pending cpu signals
665 * and spinning on a lock.
666 *
667 */
668 unsigned
669 int
670 (usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
671 uint64_t deadline
672 LCK_GRP_ARG(lck_grp_t *grp))
673 {
674 boolean_t istate = ml_get_interrupts_enabled();
675
676 if (deadline < mach_absolute_time()) {
677 return 0;
678 }
679
680 while (!simple_lock_try(l, grp)) {
681 if (!istate) {
682 cpu_signal_handler(NULL);
683 }
684
685 if (deadline < mach_absolute_time()) {
686 return 0;
687 }
688
689 cpu_pause();
690 }
691
692 return 1;
693 }
694
695 void
696 (usimple_lock_try_lock_loop)(usimple_lock_t l
697 LCK_GRP_ARG(lck_grp_t *grp))
698 {
699 /* When the lock is not contended, grab the lock and go. */
700 if (!simple_lock_try(l, grp)) {
701 usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
702 }
703 }
704
705 unsigned
706 int
707 (usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
708 uint64_t duration
709 LCK_GRP_ARG(lck_grp_t *grp))
710 {
711 uint64_t deadline;
712 uint64_t base_at;
713 uint64_t duration_at;
714
715 /* Fast track for uncontended locks */
716 if (simple_lock_try(l, grp)) {
717 return 1;
718 }
719
720 base_at = mach_absolute_time();
721
722 nanoseconds_to_absolutetime(duration, &duration_at);
723 deadline = base_at + duration_at;
724 if (deadline < base_at) {
725 /* deadline has overflowed, make it saturate */
726 deadline = ULLONG_MAX;
727 }
728
729 return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
730 }
731
732 #if USLOCK_DEBUG
733 /*
734 * States of a usimple_lock. The default when initializing
735 * a usimple_lock is setting it up for debug checking.
736 */
737 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
738 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
739 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
740 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
741 #define USLOCK_CHECKING(l) (uslock_check && \
742 ((l)->debug.state & USLOCK_CHECKED))
743
744 /*
745 * Initialize the debugging information contained
746 * in a usimple_lock.
747 */
748 void
usld_lock_init(usimple_lock_t l,__unused unsigned short tag)749 usld_lock_init(
750 usimple_lock_t l,
751 __unused unsigned short tag)
752 {
753 if (l == USIMPLE_LOCK_NULL) {
754 panic("lock initialization: null lock pointer");
755 }
756 l->lock_type = USLOCK_TAG;
757 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
758 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
759 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
760 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
761 l->debug.duration[0] = l->debug.duration[1] = 0;
762 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
763 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
764 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
765 }
766
767
/*
 * These checks apply to all usimple_locks, not just
 * those with USLOCK_CHECKED turned on.
 *
 * Panics (naming `caller`) when the lock pointer is NULL, the storage is
 * not a usimple lock, or the lock was never initialized.  Returns nonzero
 * when the per-lock debug checks are armed (USLOCK_CHECKING), which tells
 * the caller whether to run its deeper checks.
 */
int
usld_lock_common_checks(
	usimple_lock_t l,
	char *caller)
{
	if (l == USIMPLE_LOCK_NULL) {
		panic("%s: null lock pointer", caller);
	}
	if (l->lock_type != USLOCK_TAG) {
		panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
	}
	if (!(l->debug.state & USLOCK_INIT)) {
		panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
	}
	return USLOCK_CHECKING(l);
}
788
789
/*
 * Debug checks on a usimple_lock just before attempting
 * to acquire it: panic on a recursive acquisition attempt by
 * the current thread.
 */
/* ARGSUSED */
void
usld_lock_pre(
	usimple_lock_t l,
	pc_t pc)
{
	char caller[] = "usimple_lock";


	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

	/*
	 * Note that we have a weird case where we are getting a lock when we are
	 * in the process of putting the system to sleep.  We are running with no
	 * current threads, therefore we can't tell if we are trying to retake a lock
	 * we have or someone on the other processor has it.  Therefore we just
	 * ignore this test if the locking thread is 0.
	 */

	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
	    l->debug.lock_thread == (void *) current_thread()) {
		printf("%s: lock %p already locked (at %p) by",
		    caller, l, l->debug.lock_pc);
		printf(" current thread %p (new attempt at pc %p)\n",
		    l->debug.lock_thread, pc);
		panic("%s", caller);
	}
	/* preemption toggle: presumably a scheduling/assertion side-effect —
	 * NOTE(review): confirm why this pair is needed here */
	mp_disable_preemption();
	mp_enable_preemption();
}
826
827
828 /*
829 * Debug checks on a usimple_lock just after acquiring it.
830 *
831 * Pre-emption has been disabled at this point,
832 * so we are safe in using cpu_number.
833 */
834 void
usld_lock_post(usimple_lock_t l,pc_t pc)835 usld_lock_post(
836 usimple_lock_t l,
837 pc_t pc)
838 {
839 unsigned int mycpu;
840 char caller[] = "successful usimple_lock";
841
842
843 if (!usld_lock_common_checks(l, caller)) {
844 return;
845 }
846
847 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
848 panic("%s: lock %p became uninitialized",
849 caller, l);
850 }
851 if ((l->debug.state & USLOCK_TAKEN)) {
852 panic("%s: lock 0x%p became TAKEN by someone else",
853 caller, l);
854 }
855
856 mycpu = (unsigned int)cpu_number();
857 assert(mycpu <= UCHAR_MAX);
858
859 l->debug.lock_thread = (void *)current_thread();
860 l->debug.state |= USLOCK_TAKEN;
861 l->debug.lock_pc = pc;
862 l->debug.lock_cpu = (unsigned char)mycpu;
863 }
864
865
866 /*
867 * Debug checks on a usimple_lock just before
868 * releasing it. Note that the caller has not
869 * yet released the hardware lock.
870 *
871 * Preemption is still disabled, so there's
872 * no problem using cpu_number.
873 */
874 void
usld_unlock(usimple_lock_t l,pc_t pc)875 usld_unlock(
876 usimple_lock_t l,
877 pc_t pc)
878 {
879 unsigned int mycpu;
880 char caller[] = "usimple_unlock";
881
882
883 if (!usld_lock_common_checks(l, caller)) {
884 return;
885 }
886
887 mycpu = cpu_number();
888 assert(mycpu <= UCHAR_MAX);
889
890 if (!(l->debug.state & USLOCK_TAKEN)) {
891 panic("%s: lock 0x%p hasn't been taken",
892 caller, l);
893 }
894 if (l->debug.lock_thread != (void *) current_thread()) {
895 panic("%s: unlocking lock 0x%p, owned by thread %p",
896 caller, l, l->debug.lock_thread);
897 }
898 if (l->debug.lock_cpu != mycpu) {
899 printf("%s: unlocking lock 0x%p on cpu 0x%x",
900 caller, l, mycpu);
901 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
902 panic("%s", caller);
903 }
904
905 l->debug.unlock_thread = l->debug.lock_thread;
906 l->debug.lock_thread = INVALID_PC;
907 l->debug.state &= ~USLOCK_TAKEN;
908 l->debug.unlock_pc = pc;
909 l->debug.unlock_cpu = (unsigned char)mycpu;
910 }
911
912
913 /*
914 * Debug checks on a usimple_lock just before
915 * attempting to acquire it.
916 *
917 * Preemption isn't guaranteed to be disabled.
918 */
919 void
usld_lock_try_pre(usimple_lock_t l,__unused pc_t pc)920 usld_lock_try_pre(
921 usimple_lock_t l,
922 __unused pc_t pc)
923 {
924 char caller[] = "usimple_lock_try";
925
926 if (!usld_lock_common_checks(l, caller)) {
927 return;
928 }
929 }
930
931
932 /*
933 * Debug checks on a usimple_lock just after
934 * successfully attempting to acquire it.
935 *
936 * Preemption has been disabled by the
937 * lock acquisition attempt, so it's safe
938 * to use cpu_number.
939 */
940 void
usld_lock_try_post(usimple_lock_t l,pc_t pc)941 usld_lock_try_post(
942 usimple_lock_t l,
943 pc_t pc)
944 {
945 unsigned int mycpu;
946 char caller[] = "successful usimple_lock_try";
947
948 if (!usld_lock_common_checks(l, caller)) {
949 return;
950 }
951
952 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
953 panic("%s: lock 0x%p became uninitialized",
954 caller, l);
955 }
956 if ((l->debug.state & USLOCK_TAKEN)) {
957 panic("%s: lock 0x%p became TAKEN by someone else",
958 caller, l);
959 }
960
961 mycpu = cpu_number();
962 assert(mycpu <= UCHAR_MAX);
963
964 l->debug.lock_thread = (void *) current_thread();
965 l->debug.state |= USLOCK_TAKEN;
966 l->debug.lock_pc = pc;
967 l->debug.lock_cpu = (unsigned char)mycpu;
968 }
969 #endif /* USLOCK_DEBUG */
970 #if LCK_MTX_USE_ARCH
971
972 /*
973 * Slow path routines for lck_mtx locking and unlocking functions.
974 *
975 * These functions were previously implemented in x86 assembly,
976 * and some optimizations are in place in this c code to obtain a compiled code
977 * as performant and compact as the assembly version.
978 *
979 * To avoid to inline these functions on the fast path, all functions directly called by
980 * the fast paths have the __attribute__((noinline)) specified. Also they are all implemented
981 * in such a way the fast path can tail call into them. In this way the return address
982 * does not need to be pushed on the caller stack and stack optimization can happen on the caller.
983 *
984 * Slow path code is structured in such a way there are no calls to functions that will return
985 * on the context of the caller function, i.e. all functions called are or tail call functions
986 * or inline functions. The number of arguments of the tail call functions are less then six,
987 * so that they can be passed over registers and do not need to be pushed on stack.
988 * This allows the compiler to not create a stack frame for the functions.
989 *
990 * __improbable and __probable are used to compile the slow path code in such a way
991 * the fast path case will be on a sequence of instructions with as less jumps as possible,
992 * to make this case the most optimized even if falling through the slow path.
993 */
994
995 /*
996 * Intel lock invariants:
997 *
998 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
999 *
1000 * The lock owner is promoted to the max priority of all its waiters only if it
1001 * was a lower priority when it acquired or was an owner when a waiter waited.
1002 * Max priority is capped at MAXPRI_PROMOTE.
1003 *
1004 * The last waiter will not be promoted as it is woken up, but the last
1005 * lock owner may not have been the last thread to have been woken up depending on the
1006 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
1007 * flag set.
1008 *
1009 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
1010 * priority from dropping priority in the future without having to take thread lock
1011 * on acquire.
1012 */
1013
1014 /*
1015 * Routine: lck_mtx_alloc_init
1016 */
1017 lck_mtx_t *
lck_mtx_alloc_init(lck_grp_t * grp,lck_attr_t * attr)1018 lck_mtx_alloc_init(
1019 lck_grp_t *grp,
1020 lck_attr_t *attr)
1021 {
1022 lck_mtx_t *lck;
1023
1024 lck = zalloc(KT_LCK_MTX);
1025 lck_mtx_init(lck, grp, attr);
1026 return lck;
1027 }
1028
1029 /*
1030 * Routine: lck_mtx_free
1031 */
1032 void
lck_mtx_free(lck_mtx_t * lck,lck_grp_t * grp)1033 lck_mtx_free(
1034 lck_mtx_t *lck,
1035 lck_grp_t *grp)
1036 {
1037 lck_mtx_destroy(lck, grp);
1038 zfree(KT_LCK_MTX, lck);
1039 }
1040
/*
 * Routine:	lck_mtx_init
 *
 * Initialize caller-provided mutex storage for group `grp`.
 * LCK_ATTR_NULL selects the system default attributes.
 */
void
lck_mtx_init(
	lck_mtx_t *lck,
	lck_grp_t *grp,
	lck_attr_t *attr)
{
	if (attr == LCK_ATTR_NULL) {
		attr = &lck_attr_default;
	}

	/* zero the whole mutex; only the group id survives */
	*lck = (lck_mtx_t){
		.lck_mtx_grp = grp->lck_grp_attr_id,
	};

	/*
	 * NOTE(review): the second operand masks lck_grp_attr_id with
	 * LCK_GRP_ATTR_DEBUG — confirm the attr-id encoding actually
	 * carries that flag bit, as this tests the id word, not a
	 * separate attribute word.
	 */
	if (__improbable((attr->lck_attr_val & LCK_ATTR_DEBUG) ||
	    (grp->lck_grp_attr_id & LCK_GRP_ATTR_DEBUG))) {
		lck->lck_mtx_profile = 1;
	}

	lck_grp_reference(grp, &grp->lck_grp_mtxcnt);
}
1065
/*
 * Routine:	lck_mtx_destroy
 *
 * Mark the mutex destroyed and drop the group's mutex reference.
 * Idempotent: destroying an already-destroyed mutex is a no-op.
 * The mutex must not be owned (asserted on MACH_LDEBUG kernels).
 */
void
lck_mtx_destroy(
	lck_mtx_t *lck,
	lck_grp_t *grp)
{
	if (lck->lck_mtx_state == LCK_MTX_TAG_DESTROYED) {
		return;
	}
#if MACH_LDEBUG
	lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
#endif
	/* release-store so prior uses of the mutex are visible before the tag */
	ordered_store_mtx_state_release(lck, LCK_MTX_TAG_DESTROYED);
	lck_grp_deallocate(grp, &grp->lck_grp_mtxcnt);
}
1083
1084
#if DEVELOPMENT | DEBUG
/*
 * Out-of-line panic helper for lck_mtx_unlock_slow()'s owner check.
 * noinline keeps the cold panic path off the unlock fast path.
 */
__attribute__((noinline))
void
lck_mtx_owner_check_panic(
	lck_mtx_t *lock)
{
	/* "unsafe" ctid lookup is acceptable: we are about to panic anyway */
	thread_t owner = ctid_get_thread_unsafe(lock->lck_mtx_owner);
	panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
}
#endif
1095
/*
 * Routine: lck_mtx_unlock_slow
 *
 * Unlocks a mutex held by current thread.
 *
 * It will wake up waiters if necessary.
 *
 * Interlock can be held.
 */
__attribute__((noinline))
void
lck_mtx_unlock_slow(
	lck_mtx_t *lock)
{
	thread_t thread;
	uint32_t state;

	state = ordered_load_mtx_state(lock);

	thread = current_thread();

#if DEVELOPMENT | DEBUG
	/* only the owner may unlock; catch mismatches on dev/debug kernels */
	if (__improbable(lock->lck_mtx_owner != thread->ctid)) {
		lck_mtx_owner_check_panic(lock);
	}
#endif

	/* check if it is held as a spinlock */
	if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
		goto unlock;
	}

	/* held as a full mutex: take the interlock and clear the mutex bit */
	lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);

unlock:
	/* preemption disabled, interlock held and mutex not held */

	/* clear owner */
	ordered_store_mtx_owner(lock, 0);

	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
		/* tail call: wakes a waiter and finishes releasing the lock */
		return lck_mtx_unlock_wakeup_tail(lock, state);
	}

	/* release interlock, promotion and clear spin flag */
	state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
	ordered_store_mtx_state_release(lock, state);   /* since I own the interlock, I don't need an atomic update */

	/* re-enable preemption */
	lck_mtx_unlock_finish_inline(lock, state);
}
1147
1148 #define LCK_MTX_LCK_WAIT_CODE 0x20
1149 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
1150 #define LCK_MTX_LCK_SPIN_CODE 0x22
1151 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
1152 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
1153
1154 /*
1155 * Routine: lck_mtx_unlock_wakeup_tail
1156 *
1157 * Invoked on unlock when there is
1158 * contention, i.e. the assembly routine sees
1159 * that mutex->lck_mtx_waiters != 0
1160 *
1161 * neither the mutex or interlock is held
1162 *
1163 * Note that this routine might not be called if there are pending
1164 * waiters which have previously been woken up, and they didn't
1165 * end up boosting the old owner.
1166 *
1167 * assembly routine previously did the following to mutex:
1168 * (after saving the state in prior_lock_state)
1169 * decremented lck_mtx_waiters if nonzero
1170 *
1171 * This function needs to be called as a tail call
1172 * to optimize the compiled code.
1173 */
1174 __attribute__((noinline))
1175 static void
lck_mtx_unlock_wakeup_tail(lck_mtx_t * mutex,uint32_t state)1176 lck_mtx_unlock_wakeup_tail(
1177 lck_mtx_t *mutex,
1178 uint32_t state)
1179 {
1180 struct turnstile *ts;
1181
1182 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
1183 kern_return_t did_wake;
1184
1185 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
1186 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1187
1188 ts = turnstile_prepare_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
1189
1190 did_wake = waitq_wakeup64_one(&ts->ts_waitq, LCK_MTX_EVENT(mutex),
1191 THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
1192 assert(did_wake == KERN_SUCCESS);
1193
1194 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
1195 turnstile_complete_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
1196
1197 state -= LCK_MTX_WAITER;
1198 state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
1199 ordered_store_mtx_state_release(mutex, state);
1200
1201 assert(current_thread()->turnstile != NULL);
1202
1203 turnstile_cleanup();
1204
1205 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
1206 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1207
1208 lck_mtx_unlock_finish_inline(mutex, state);
1209 }
1210
1211 /*
1212 * Routine: lck_mtx_lock_acquire_x86
1213 *
1214 * Invoked on acquiring the mutex when there is
1215 * contention (i.e. the assembly routine sees that
1216 * that mutex->lck_mtx_waiters != 0
1217 *
1218 * mutex is owned... interlock is held... preemption is disabled
1219 */
1220 __attribute__((always_inline))
1221 static void
lck_mtx_lock_acquire_inline(lck_mtx_t * mutex,struct turnstile * ts)1222 lck_mtx_lock_acquire_inline(
1223 lck_mtx_t *mutex,
1224 struct turnstile *ts)
1225 {
1226 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
1227
1228 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
1229 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1230
1231 if (mutex->lck_mtx_waiters > 0) {
1232 if (ts == NULL) {
1233 ts = turnstile_prepare_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
1234 }
1235
1236 turnstile_update_inheritor(ts, current_thread(),
1237 TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
1238 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
1239 }
1240
1241 if (ts != NULL) {
1242 turnstile_complete_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
1243 }
1244
1245 assert(current_thread()->turnstile != NULL);
1246
1247 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
1248 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1249 }
1250
1251 void
lck_mtx_lock_acquire_x86(lck_mtx_t * mutex)1252 lck_mtx_lock_acquire_x86(
1253 lck_mtx_t *mutex)
1254 {
1255 return lck_mtx_lock_acquire_inline(mutex, NULL);
1256 }
1257
1258 /*
1259 * Tail call helpers for lock functions that perform
1260 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
1261 * the caller's compiled code.
1262 */
1263
1264 __attribute__((noinline))
1265 static void
lck_mtx_lock_acquire_tail(lck_mtx_t * mutex,struct turnstile * ts)1266 lck_mtx_lock_acquire_tail(
1267 lck_mtx_t *mutex,
1268 struct turnstile *ts)
1269 {
1270 lck_mtx_lock_acquire_inline(mutex, ts);
1271 lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex));
1272 }
1273
1274 __attribute__((noinline))
1275 static boolean_t
lck_mtx_try_lock_acquire_tail(lck_mtx_t * mutex)1276 lck_mtx_try_lock_acquire_tail(
1277 lck_mtx_t *mutex)
1278 {
1279 lck_mtx_lock_acquire_inline(mutex, NULL);
1280 lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
1281
1282 return TRUE;
1283 }
1284
1285 __attribute__((noinline))
1286 static void
lck_mtx_convert_spin_acquire_tail(lck_mtx_t * mutex)1287 lck_mtx_convert_spin_acquire_tail(
1288 lck_mtx_t *mutex)
1289 {
1290 lck_mtx_lock_acquire_inline(mutex, NULL);
1291 lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
1292 }
1293
1294 static void
lck_mtx_ilk_unlock(lck_mtx_t * mutex)1295 lck_mtx_ilk_unlock(
1296 lck_mtx_t *mutex)
1297 {
1298 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
1299 }
1300
1301 static inline void
lck_mtx_interlock_lock_set_and_clear_flags(lck_mtx_t * mutex,uint32_t xor_flags,uint32_t and_flags,uint32_t * new_state)1302 lck_mtx_interlock_lock_set_and_clear_flags(
1303 lck_mtx_t *mutex,
1304 uint32_t xor_flags,
1305 uint32_t and_flags,
1306 uint32_t *new_state)
1307 {
1308 uint32_t state, prev;
1309 state = *new_state;
1310
1311 for (;;) {
1312 /* have to wait for interlock to clear */
1313 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
1314 cpu_pause();
1315 state = ordered_load_mtx_state(mutex);
1316 }
1317 prev = state; /* prev contains snapshot for exchange */
1318 state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
1319 state &= ~and_flags; /* clear flags */
1320
1321 disable_preemption();
1322 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
1323 break;
1324 }
1325 enable_preemption();
1326 cpu_pause();
1327 state = ordered_load_mtx_state(mutex);
1328 }
1329 *new_state = state;
1330 return;
1331 }
1332
1333 static inline void
lck_mtx_interlock_lock_clear_flags(lck_mtx_t * mutex,uint32_t and_flags,uint32_t * new_state)1334 lck_mtx_interlock_lock_clear_flags(
1335 lck_mtx_t *mutex,
1336 uint32_t and_flags,
1337 uint32_t *new_state)
1338 {
1339 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
1340 }
1341
1342 static inline void
lck_mtx_interlock_lock(lck_mtx_t * mutex,uint32_t * new_state)1343 lck_mtx_interlock_lock(
1344 lck_mtx_t *mutex,
1345 uint32_t *new_state)
1346 {
1347 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
1348 }
1349
1350 static inline int
lck_mtx_interlock_try_lock_set_flags(lck_mtx_t * mutex,uint32_t or_flags,uint32_t * new_state)1351 lck_mtx_interlock_try_lock_set_flags(
1352 lck_mtx_t *mutex,
1353 uint32_t or_flags,
1354 uint32_t *new_state)
1355 {
1356 uint32_t state, prev;
1357 state = *new_state;
1358
1359 /* have to wait for interlock to clear */
1360 if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
1361 return 0;
1362 }
1363 prev = state; /* prev contains snapshot for exchange */
1364 state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
1365 disable_preemption();
1366 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
1367 *new_state = state;
1368 return 1;
1369 }
1370
1371 enable_preemption();
1372 return 0;
1373 }
1374
1375 __attribute__((noinline))
1376 static void
lck_mtx_lock_contended(lck_mtx_t * lock,bool profile,boolean_t * first_miss)1377 lck_mtx_lock_contended(
1378 lck_mtx_t *lock,
1379 bool profile,
1380 boolean_t *first_miss)
1381 {
1382 lck_mtx_spinwait_ret_type_t ret;
1383 uint32_t state;
1384 thread_t thread;
1385 struct turnstile *ts = NULL;
1386
1387 try_again:
1388
1389 if (profile) {
1390 LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, first_miss);
1391 }
1392
1393 ret = lck_mtx_lock_spinwait_x86(lock);
1394 state = ordered_load_mtx_state(lock);
1395 switch (ret) {
1396 case LCK_MTX_SPINWAIT_NO_SPIN:
1397 /*
1398 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
1399 * try to spin.
1400 */
1401 /* just fall through case LCK_MTX_SPINWAIT_SPUN */
1402 OS_FALLTHROUGH;
1403 case LCK_MTX_SPINWAIT_SPUN_HIGH_THR:
1404 case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE:
1405 case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
1406 case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR:
1407 /*
1408 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
1409 * interlock not held
1410 */
1411 lck_mtx_interlock_lock(lock, &state);
1412 assert(state & LCK_MTX_ILOCKED_MSK);
1413
1414 if (state & LCK_MTX_MLOCKED_MSK) {
1415 if (profile) {
1416 LCK_MTX_PROF_WAIT(lock, lock->lck_mtx_grp,
1417 ret == LCK_MTX_SPINWAIT_NO_SPIN, first_miss);
1418 }
1419 lck_mtx_lock_wait_x86(lock, &ts);
1420 /*
1421 * interlock is not held here.
1422 */
1423 goto try_again;
1424 } else {
1425 /* grab the mutex */
1426 state |= LCK_MTX_MLOCKED_MSK;
1427 ordered_store_mtx_state_release(lock, state);
1428 thread = current_thread();
1429 ordered_store_mtx_owner(lock, thread->ctid);
1430 }
1431
1432 break;
1433 case LCK_MTX_SPINWAIT_ACQUIRED:
1434 /*
1435 * mutex has been acquired by lck_mtx_lock_spinwait_x86
1436 * interlock is held and preemption disabled
1437 * owner is set and mutex marked as locked
1438 * statistics updated too
1439 */
1440 break;
1441 default:
1442 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p", ret, lock);
1443 }
1444
1445 /*
1446 * interlock is already acquired here
1447 */
1448
1449 /* mutex has been acquired */
1450 if (state & LCK_MTX_WAITERS_MSK) {
1451 /*
1452 * lck_mtx_lock_acquire_tail will call
1453 * turnstile_complete.
1454 */
1455 return lck_mtx_lock_acquire_tail(lock, ts);
1456 }
1457
1458 if (ts != NULL) {
1459 turnstile_complete_hash((uintptr_t)lock, TURNSTILE_KERNEL_MUTEX);
1460 }
1461
1462 assert(current_thread()->turnstile != NULL);
1463
1464 /* release the interlock */
1465 lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock));
1466 }
1467
1468 /*
1469 * Helper noinline functions for calling
1470 * panic to optimize compiled code.
1471 */
1472
1473 __attribute__((noinline)) __abortlike
1474 static void
lck_mtx_destroyed(lck_mtx_t * lock)1475 lck_mtx_destroyed(
1476 lck_mtx_t *lock)
1477 {
1478 panic("trying to interlock destroyed mutex (%p)", lock);
1479 }
1480
1481 __attribute__((noinline))
1482 static boolean_t
lck_mtx_try_destroyed(lck_mtx_t * lock)1483 lck_mtx_try_destroyed(
1484 lck_mtx_t *lock)
1485 {
1486 panic("trying to interlock destroyed mutex (%p)", lock);
1487 return FALSE;
1488 }
1489
1490 __attribute__((always_inline))
1491 static boolean_t
lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t * lock,uint32_t * new_state)1492 lck_mtx_lock_wait_interlock_to_clear(
1493 lck_mtx_t *lock,
1494 uint32_t* new_state)
1495 {
1496 uint32_t state;
1497
1498 for (;;) {
1499 cpu_pause();
1500 state = ordered_load_mtx_state(lock);
1501 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
1502 *new_state = state;
1503 return TRUE;
1504 }
1505 if (state & LCK_MTX_MLOCKED_MSK) {
1506 /* if it is held as mutex, just fail */
1507 return FALSE;
1508 }
1509 }
1510 }
1511
1512 __attribute__((always_inline))
1513 static boolean_t
lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t * lock,uint32_t * new_state)1514 lck_mtx_try_lock_wait_interlock_to_clear(
1515 lck_mtx_t *lock,
1516 uint32_t* new_state)
1517 {
1518 uint32_t state;
1519
1520 for (;;) {
1521 cpu_pause();
1522 state = ordered_load_mtx_state(lock);
1523 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
1524 /* if it is held as mutex or spin, just fail */
1525 return FALSE;
1526 }
1527 if (!(state & LCK_MTX_ILOCKED_MSK)) {
1528 *new_state = state;
1529 return TRUE;
1530 }
1531 }
1532 }
1533
1534 /*
1535 * Routine: lck_mtx_lock_slow
1536 *
1537 * Locks a mutex for current thread.
1538 * If the lock is contended this function might
1539 * sleep.
1540 *
1541 * Called with interlock not held.
1542 */
1543 __attribute__((noinline))
1544 void
lck_mtx_lock_slow(lck_mtx_t * lock)1545 lck_mtx_lock_slow(
1546 lck_mtx_t *lock)
1547 {
1548 bool profile;
1549 uint32_t state;
1550 int first_miss = 0;
1551
1552 state = ordered_load_mtx_state(lock);
1553 profile = (state & LCK_MTX_PROFILE_MSK);
1554
1555 if (__improbable(profile)) {
1556 if (state & LCK_MTX_SPIN_MSK) {
1557 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
1558 assert(state & LCK_MTX_ILOCKED_MSK);
1559 LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
1560 }
1561 }
1562
1563 /* is the interlock or mutex held */
1564 if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
1565 /*
1566 * Note: LCK_MTX_TAG_DESTROYED has
1567 * LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
1568 */
1569
1570 /* is the mutex already held */
1571 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
1572 /* no, must have been the mutex */
1573 return lck_mtx_lock_contended(lock, profile, &first_miss);
1574 }
1575
1576 /* check to see if it is marked destroyed */
1577 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
1578 lck_mtx_destroyed(lock);
1579 }
1580 }
1581
1582 /* no - can't be DESTROYED or locked */
1583 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
1584 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
1585 return lck_mtx_lock_contended(lock, profile, &first_miss);
1586 }
1587 }
1588
1589 /* lock and interlock acquired */
1590
1591 /* record owner of mutex */
1592 ordered_store_mtx_owner(lock, current_thread()->ctid);
1593
1594 /*
1595 * Check if there are waiters to
1596 * inherit their priority.
1597 */
1598 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
1599 return lck_mtx_lock_acquire_tail(lock, NULL);
1600 }
1601
1602 /* release the interlock */
1603 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock));
1604 }
1605
1606 __attribute__((noinline))
1607 boolean_t
lck_mtx_try_lock_slow(lck_mtx_t * lock)1608 lck_mtx_try_lock_slow(
1609 lck_mtx_t *lock)
1610 {
1611 bool profile;
1612 uint32_t state;
1613 int first_miss = 0;
1614
1615 state = ordered_load_mtx_state(lock);
1616 profile = (state & LCK_MTX_PROFILE_MSK);
1617
1618 /* is the interlock or mutex held */
1619 if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
1620 /*
1621 * Note: LCK_MTX_TAG_DESTROYED has
1622 * LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
1623 */
1624
1625 /* is the mutex already held */
1626 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
1627 return FALSE;
1628 }
1629
1630 /* check to see if it is marked destroyed */
1631 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
1632 lck_mtx_try_destroyed(lock);
1633 }
1634
1635 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
1636 if (__improbable(profile)) {
1637 LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
1638 }
1639 return FALSE;
1640 }
1641 }
1642
1643 if (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
1644 if (__improbable(profile)) {
1645 LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
1646 }
1647 return FALSE;
1648 }
1649
1650 /* lock and interlock acquired */
1651
1652 /* record owner of mutex */
1653 ordered_store_mtx_owner(lock, current_thread()->ctid);
1654
1655 /*
1656 * Check if there are waiters to
1657 * inherit their priority.
1658 */
1659 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
1660 return lck_mtx_try_lock_acquire_tail(lock);
1661 }
1662
1663 /* release the interlock */
1664 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
1665
1666 return TRUE;
1667 }
1668
1669 __attribute__((noinline))
1670 void
lck_mtx_lock_spin_slow(lck_mtx_t * lock)1671 lck_mtx_lock_spin_slow(
1672 lck_mtx_t *lock)
1673 {
1674 bool profile;
1675 uint32_t state;
1676 int first_miss = 0;
1677
1678 state = ordered_load_mtx_state(lock);
1679 profile = (state & LCK_MTX_PROFILE_MSK);
1680
1681 if (__improbable(profile)) {
1682 if (state & LCK_MTX_SPIN_MSK) {
1683 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
1684 assert(state & LCK_MTX_ILOCKED_MSK);
1685 LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
1686 }
1687 }
1688
1689 /* is interlock or mutex held */
1690 if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
1691 /*
1692 * Note: LCK_MTX_TAG_DESTROYED has
1693 * LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
1694 */
1695
1696 /* is the mutex already held */
1697 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
1698 /* no, must have been the mutex */
1699 return lck_mtx_lock_contended(lock, profile, &first_miss);
1700 }
1701
1702 /* check to see if it is marked destroyed */
1703 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
1704 lck_mtx_destroyed(lock);
1705 }
1706 }
1707
1708 bool acquired = true;
1709
1710 #if CONFIG_DTRACE
1711 uint64_t spin_start;
1712
1713 if (__probable(lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
1714 goto past_spin;
1715 }
1716
1717 spin_start = LCK_MTX_SPIN_SPIN_BEGIN();
1718 #endif /* CONFIG_DTRACE */
1719
1720 /* note - can't be DESTROYED or locked */
1721 /* note - preemption is not disabled while spinning */
1722 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
1723 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
1724 acquired = false;
1725 break;
1726 }
1727 }
1728
1729 LCK_MTX_SPIN_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
1730 #if CONFIG_DTRACE
1731 past_spin:
1732 #endif /* CONFIG_DTRACE */
1733
1734 if (__improbable(!acquired)) {
1735 return lck_mtx_lock_contended(lock, profile, &first_miss);
1736 }
1737
1738 /* lock as spinlock and interlock acquired */
1739
1740 /* record owner of mutex */
1741 ordered_store_mtx_owner(lock, current_thread()->ctid);
1742
1743 LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp, true, profile);
1744
1745 /* return with the interlock held and preemption disabled */
1746 return;
1747 }
1748
1749 __attribute__((noinline))
1750 boolean_t
lck_mtx_try_lock_spin_slow(lck_mtx_t * lock)1751 lck_mtx_try_lock_spin_slow(
1752 lck_mtx_t *lock)
1753 {
1754 bool profile;
1755 uint32_t state;
1756 int first_miss = 0;
1757
1758 state = ordered_load_mtx_state(lock);
1759 profile = (state & LCK_MTX_PROFILE_MSK);
1760
1761 /* is the interlock or mutex held */
1762 if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
1763 /*
1764 * Note: LCK_MTX_TAG_DESTROYED has
1765 * LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
1766 */
1767
1768 /* is the mutex already held */
1769 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
1770 return FALSE;
1771 }
1772
1773 /* check to see if it is marked destroyed */
1774 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
1775 lck_mtx_try_destroyed(lock);
1776 }
1777 }
1778
1779 /* note - can't be DESTROYED or locked */
1780 if (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
1781 if (__improbable(profile)) {
1782 LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
1783 }
1784 return FALSE;
1785 }
1786
1787 /* lock and interlock acquired */
1788
1789 /* record owner of mutex */
1790 ordered_store_mtx_owner(lock, current_thread()->ctid);
1791
1792 #if CONFIG_DTRACE
1793 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_SPIN_ACQUIRE, lock, 0);
1794 #endif
1795 return TRUE;
1796 }
1797
1798 __attribute__((noinline))
1799 void
lck_mtx_convert_spin(lck_mtx_t * lock)1800 lck_mtx_convert_spin(
1801 lck_mtx_t *lock)
1802 {
1803 uint32_t state;
1804
1805 state = ordered_load_mtx_state(lock);
1806
1807 assertf(lock->lck_mtx_owner == current_thread()->ctid,
1808 "lock %p not owned by thread ctid %d (current owner %d)",
1809 lock, current_thread()->ctid, lock->lck_mtx_owner);
1810
1811 if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
1812 /* already owned as a mutex, just return */
1813 return;
1814 }
1815
1816 assert(get_preemption_level() > 0);
1817 assert(state & LCK_MTX_ILOCKED_MSK);
1818 assert(state & LCK_MTX_SPIN_MSK);
1819
1820 /*
1821 * Check if there are waiters to
1822 * inherit their priority.
1823 */
1824 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
1825 return lck_mtx_convert_spin_acquire_tail(lock);
1826 }
1827
1828 lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
1829
1830 return;
1831 }
1832
1833 static inline boolean_t
lck_mtx_lock_grab_mutex(lck_mtx_t * lock)1834 lck_mtx_lock_grab_mutex(
1835 lck_mtx_t *lock)
1836 {
1837 uint32_t state;
1838
1839 state = ordered_load_mtx_state(lock);
1840
1841 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
1842 return FALSE;
1843 }
1844
1845 /* lock and interlock acquired */
1846
1847 /* record owner of mutex */
1848 ordered_store_mtx_owner(lock, current_thread()->ctid);
1849 return TRUE;
1850 }
1851
1852 __attribute__((noinline))
1853 void
lck_mtx_assert(lck_mtx_t * lock,unsigned int type)1854 lck_mtx_assert(
1855 lck_mtx_t *lock,
1856 unsigned int type)
1857 {
1858 thread_t thread;
1859 uint32_t state;
1860
1861 thread = current_thread();
1862 state = ordered_load_mtx_state(lock);
1863
1864 if (type == LCK_MTX_ASSERT_OWNED) {
1865 if ((lock->lck_mtx_owner != thread->ctid) ||
1866 !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
1867 panic("mutex (%p) not owned", lock);
1868 }
1869 } else {
1870 assert(type == LCK_MTX_ASSERT_NOTOWNED);
1871 if (lock->lck_mtx_owner == thread->ctid) {
1872 panic("mutex (%p) owned", lock);
1873 }
1874 }
1875 }
1876
1877 /*
1878 * Routine: lck_mtx_lock_spinwait_x86
1879 *
1880 * Invoked trying to acquire a mutex when there is contention but
1881 * the holder is running on another processor. We spin for up to a maximum
1882 * time waiting for the lock to be released.
1883 *
1884 * Called with the interlock unlocked.
1885 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
1886 * returns LCK_MTX_SPINWAIT_SPUN if we spun
1887 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
1888 */
1889 __attribute__((noinline))
1890 lck_mtx_spinwait_ret_type_t
lck_mtx_lock_spinwait_x86(lck_mtx_t * mutex)1891 lck_mtx_lock_spinwait_x86(
1892 lck_mtx_t *mutex)
1893 {
1894 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
1895 ctid_t owner, prev_owner;
1896 uint64_t window_deadline, sliding_deadline, high_deadline;
1897 uint64_t spin_start, start_time, cur_time, avg_hold_time, bias, delta;
1898 lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
1899 int loopcount = 0;
1900 int total_hold_time_samples, window_hold_time_samples, unfairness;
1901 uint prev_owner_cpu;
1902 bool adjust;
1903
1904 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
1905 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe(mutex->lck_mtx_owner)),
1906 mutex->lck_mtx_waiters, 0, 0);
1907
1908 spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
1909 start_time = mach_absolute_time();
1910
1911 /*
1912 * window_deadline represents the "learning" phase.
1913 * The thread collects statistics about the lock during
1914 * window_deadline and then it makes a decision on whether to spin more
1915 * or block according to the concurrency behavior
1916 * observed.
1917 *
1918 * Every thread can spin at least low_MutexSpin.
1919 */
1920 window_deadline = start_time + low_MutexSpin;
1921 /*
1922 * Sliding_deadline is the adjusted spin deadline
1923 * computed after the "learning" phase.
1924 */
1925 sliding_deadline = window_deadline;
1926 /*
1927 * High_deadline is a hard deadline. No thread
1928 * can spin more than this deadline.
1929 */
1930 if (high_MutexSpin >= 0) {
1931 high_deadline = start_time + high_MutexSpin;
1932 } else {
1933 high_deadline = start_time + low_MutexSpin * real_ncpus;
1934 }
1935
1936 /*
1937 * Do not know yet which is the owner cpu.
1938 * Initialize prev_owner_cpu with next cpu.
1939 */
1940 prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
1941 total_hold_time_samples = 0;
1942 window_hold_time_samples = 0;
1943 avg_hold_time = 0;
1944 adjust = TRUE;
1945 bias = (os_hash_kernel_pointer(mutex) + cpu_number()) % real_ncpus;
1946
1947 prev_owner = mutex->lck_mtx_owner;
1948
1949 /*
1950 * Spin while:
1951 * - mutex is locked, and
1952 * - it's locked as a spin lock, and
1953 * - owner is running on another processor, and
1954 * - we haven't spun for long enough.
1955 */
1956 do {
1957 /*
1958 * Try to acquire the lock.
1959 */
1960 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
1961 retval = LCK_MTX_SPINWAIT_ACQUIRED;
1962 break;
1963 }
1964
1965 cur_time = mach_absolute_time();
1966
1967 /*
1968 * Never spin past high_deadline.
1969 */
1970 if (cur_time >= high_deadline) {
1971 retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
1972 break;
1973 }
1974
1975 /*
1976 * Check if owner is on core. If not block.
1977 */
1978 owner = mutex->lck_mtx_owner;
1979 if (owner) {
1980 uint32_t i = prev_owner_cpu;
1981 bool owner_on_core = false;
1982
1983 disable_preemption();
1984 owner = mutex->lck_mtx_owner;
1985
1986 /*
1987 * For scalability we want to check if the owner is on core
1988 * without locking the mutex interlock.
1989 * If we do not lock the mutex interlock, the owner that we see might be
1990 * invalid, so we cannot dereference it. Therefore we cannot check
1991 * any field of the thread to tell us if it is on core.
1992 * Check if the thread that is running on the other cpus matches the owner.
1993 */
1994 if (owner) {
1995 thread_t owner_thread = ctid_get_thread_unsafe(owner);
1996
1997 do {
1998 if ((cpu_data_ptr[i] != NULL) && (cpu_data_ptr[i]->cpu_active_thread == owner_thread)) {
1999 owner_on_core = true;
2000 break;
2001 }
2002 if (++i >= real_ncpus) {
2003 i = 0;
2004 }
2005 } while (i != prev_owner_cpu);
2006 }
2007
2008 enable_preemption();
2009
2010 if (owner == 0) {
2011 /* nothing to do, we didn't find an owner */
2012 } else if (owner_on_core) {
2013 prev_owner_cpu = i;
2014 } else {
2015 prev_owner = owner;
2016 owner = mutex->lck_mtx_owner;
2017 if (owner == prev_owner) {
2018 /*
2019 * Owner is not on core.
2020 * Stop spinning.
2021 */
2022 if (loopcount == 0) {
2023 retval = LCK_MTX_SPINWAIT_NO_SPIN;
2024 } else {
2025 retval = LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE;
2026 }
2027 break;
2028 }
2029 /*
2030 * Fall through if the owner changed while we were scanning.
2031 * The new owner could potentially be on core, so loop
2032 * again.
2033 */
2034 }
2035 }
2036
2037 /*
2038 * Save how many times we see the owner changing.
2039 * We can roughly estimate the mutex hold
2040 * time and the fairness with that.
2041 */
2042 if (owner != prev_owner) {
2043 prev_owner = owner;
2044 total_hold_time_samples++;
2045 window_hold_time_samples++;
2046 }
2047
2048 /*
2049 * Learning window expired.
2050 * Try to adjust the sliding_deadline.
2051 */
2052 if (cur_time >= window_deadline) {
2053 /*
2054 * If there was not contention during the window
2055 * stop spinning.
2056 */
2057 if (window_hold_time_samples < 1) {
2058 retval = LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION;
2059 break;
2060 }
2061
2062 if (adjust) {
2063 /*
2064 * For a fair lock, we'd wait for at most (NCPU-1) periods,
2065 * but the lock is unfair, so let's try to estimate by how much.
2066 */
2067 unfairness = total_hold_time_samples / real_ncpus;
2068
2069 if (unfairness == 0) {
2070 /*
2071 * We observed the owner changing `total_hold_time_samples` times which
2072 * let us estimate the average hold time of this mutex for the duration
2073 * of the spin time.
2074 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
2075 *
2076 * In this case spin at max avg_hold_time * (real_ncpus - 1)
2077 */
2078 delta = cur_time - start_time;
2079 sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
2080 } else {
2081 /*
2082 * In this case at least one of the other cpus was able to get the lock twice
2083 * while I was spinning.
2084 * We could spin longer but it won't necessarily help if the system is unfair.
2085 * Try to randomize the wait to reduce contention.
2086 *
2087 * We compute how much time we could potentially spin
2088 * and distribute it over the cpus.
2089 *
2090 * bias is an integer between 0 and real_ncpus.
2091 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
2092 */
2093 delta = high_deadline - cur_time;
2094 sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
2095 adjust = FALSE;
2096 }
2097 }
2098
2099 window_deadline += low_MutexSpin;
2100 window_hold_time_samples = 0;
2101 }
2102
2103 /*
2104 * Stop spinning if we past
2105 * the adjusted deadline.
2106 */
2107 if (cur_time >= sliding_deadline) {
2108 retval = LCK_MTX_SPINWAIT_SPUN_SLIDING_THR;
2109 break;
2110 }
2111
2112 if (mutex->lck_mtx_owner != 0) {
2113 cpu_pause();
2114 }
2115
2116 loopcount++;
2117 } while (TRUE);
2118
2119 LCK_MTX_ADAPTIVE_SPIN_END(mutex, mutex->lck_mtx_grp, spin_start);
2120
2121 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
2122 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe(mutex->lck_mtx_owner)),
2123 mutex->lck_mtx_waiters, retval, 0);
2124
2125 return retval;
2126 }
2127
2128
2129
2130 /*
2131 * Routine: lck_mtx_lock_wait_x86
2132 *
2133 * Invoked in order to wait on contention.
2134 *
2135 * Called with the interlock locked and
2136 * preemption disabled...
2137 * returns it unlocked and with preemption enabled
2138 *
2139 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
2140 * A runnable waiter can exist between wait and acquire
2141 * without a waiters count being set.
2142 * This allows us to never make a spurious wakeup call.
2143 *
2144 * Priority:
2145 * This avoids taking the thread lock if the owning thread is the same priority.
2146 * This optimizes the case of same-priority threads contending on a lock.
2147 * However, that allows the owning thread to drop in priority while holding the lock,
2148 * because there is no state that the priority change can notice that
2149 * says that the targeted thread holds a contended mutex.
2150 *
2151 * One possible solution: priority changes could look for some atomic tag
2152 * on the thread saying 'holding contended lock', and then set up a promotion.
2153 * Needs a story for dropping that promotion - the last contended unlock
2154 * has to notice that this has happened.
2155 */
__attribute__((noinline))
void
lck_mtx_lock_wait_x86(
	lck_mtx_t               *mutex,
	struct turnstile        **ts)
{
	thread_t self = current_thread();
	uint64_t sleep_start;

	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);

	/*
	 * Resolve the owner's compact thread id to a thread_t while the
	 * interlock is still held, so the owner can't be freed underneath us.
	 */
	thread_t holder = ctid_get_thread(mutex->lck_mtx_owner);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(holder),
	    mutex->lck_mtx_waiters, 0, 0);
	sleep_start = LCK_MTX_BLOCK_BEGIN();

	/*
	 * Protected by the interlock: tells the eventual unlocker that a
	 * wakeup is required (see header comment — waiters is 1:1 with a
	 * needed wakeup).
	 */
	mutex->lck_mtx_waiters++;

	/*
	 * lck_mtx_lock_wait_x86 might be called in a loop. Call prepare just once and reuse
	 * the same turnstile while looping; the matching turnstile complete will be called
	 * by lck_mtx_lock_contended when finally acquiring the lock.
	 */
	if (*ts == NULL) {
		*ts = turnstile_prepare_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
	}

	struct turnstile *turnstile = *ts;
	/* Record the block reason for stackshot / debugger wait reporting. */
	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
	/*
	 * Point the turnstile's priority inheritance at the current owner.
	 * DELAYED_UPDATE defers the actual priority propagation until the
	 * turnstile_update_inheritor_complete() call below, after the
	 * interlock has been dropped.
	 */
	turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));

	/* Queue ourselves on the turnstile's waitq before releasing the interlock. */
	waitq_assert_wait64(&turnstile->ts_waitq, LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);

	/* Per the contract above: returns with the interlock unlocked and preemption enabled. */
	lck_mtx_ilk_unlock(mutex);

	/* Flush the delayed inheritor update now that the interlock is no longer held. */
	turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);

	thread_block(THREAD_CONTINUE_NULL);

	/*
	 * NOTE: _unsafe owner lookup — the lock state may have changed while
	 * we slept; this value is for tracing only.
	 */
	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe(mutex->lck_mtx_owner)),
	    mutex->lck_mtx_waiters, 0, 0);

	LCK_MTX_BLOCK_END(mutex, mutex->lck_mtx_grp, sleep_start);
}
2203
2204 /*
2205 * Routine: kdp_lck_mtx_lock_spin_is_acquired
2206 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2207 * Returns: TRUE if lock is acquired.
2208 */
2209 boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t * lck)2210 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
2211 {
2212 if (not_in_kdp) {
2213 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
2214 }
2215
2216 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
2217 return TRUE;
2218 }
2219
2220 return FALSE;
2221 }
2222
2223 void
kdp_lck_mtx_find_owner(__unused struct waitq * waitq,event64_t event,thread_waitinfo_t * waitinfo)2224 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2225 {
2226 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
2227 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2228 waitinfo->owner = thread_tid(ctid_get_thread(mutex->lck_mtx_owner));
2229 }
2230
2231 #endif /* LCK_MTX_USE_ARCH */
2232 #if CONFIG_PV_TICKET
2233 __startup_func
2234 void
lck_init_pv(void)2235 lck_init_pv(void)
2236 {
2237 uint32_t pvtck = 1;
2238 PE_parse_boot_argn("pvticket", &pvtck, sizeof(pvtck));
2239 if (pvtck == 0) {
2240 return;
2241 }
2242 has_lock_pv = cpuid_vmm_present() &&
2243 (cpuid_vmm_get_kvm_features() & CPUID_KVM_FEATURE_PV_UNHALT) != 0;
2244 }
2245 STARTUP(LOCKS, STARTUP_RANK_FIRST, lck_init_pv);
2246 #endif
2247