1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64 #define LOCK_PRIVATE 1
65
66 #include <mach_ldebug.h>
67
68 #include <kern/lock_stat.h>
69 #include <kern/locks.h>
70 #include <kern/zalloc.h>
71 #include <kern/misc_protos.h>
72 #include <kern/thread.h>
73 #include <kern/processor.h>
74 #include <kern/cpu_data.h>
75 #include <kern/cpu_number.h>
76 #include <kern/sched_prim.h>
77 #include <kern/debug.h>
78 #include <string.h>
79
80 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
81 #include <i386/tsc.h>
82 #include <machine/atomic.h>
83 #include <machine/machine_cpu.h>
84 #include <i386/mp.h>
85 #include <machine/atomic.h>
86 #include <sys/kdebug.h>
87 #if LCK_MTX_USE_ARCH
88 #include <i386/locks_i386_inlines.h>
89 #endif /* LCK_MTX_USE_ARCH */
90 #include <kern/cpu_number.h>
91 #include <os/hash.h>
92 #include <i386/cpuid.h>
93
94 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
95
96 uint64_t _Atomic lock_panic_timeout = 0xf000000; /* 251e6 TSC ticks */
97
98 /* Forwards */
99
100 #if USLOCK_DEBUG
101 /*
102 * Perform simple lock checks.
103 */
104 int uslock_check = 1;
105 int max_lock_loops = 100000000;
106 decl_simple_lock_data(extern, printf_lock);
107 decl_simple_lock_data(extern, panic_lock);
108 #endif /* USLOCK_DEBUG */
109
110 extern unsigned int not_in_kdp;
111
112 #if !LCK_GRP_USE_ARG
113 #define usimple_lock_nopreempt(lck, grp) \
114 usimple_lock_nopreempt(lck)
115 #define usimple_lock_try_nopreempt(lck, grp) \
116 usimple_lock_try_nopreempt(lck)
117 #endif
118 static void usimple_lock_nopreempt(usimple_lock_t, lck_grp_t *);
119 static unsigned int usimple_lock_try_nopreempt(usimple_lock_t, lck_grp_t *);
120
121 /*
122 * We often want to know the addresses of the callers
123 * of the various lock routines. However, this information
124 * is only used for debugging and statistics.
125 */
126 typedef void *pc_t;
127 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
128 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
129 #if ANY_LOCK_DEBUG
130 #define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
131 #define DECL_PC(pc) pc_t pc;
132 #else /* ANY_LOCK_DEBUG */
133 #define DECL_PC(pc)
134 #ifdef lint
135 /*
136 * Eliminate lint complaints about unused local pc variables.
137 */
138 #define OBTAIN_PC(pc) ++pc
139 #else /* lint */
140 #define OBTAIN_PC(pc)
141 #endif /* lint */
#endif /* ANY_LOCK_DEBUG */
143
144 KALLOC_TYPE_DEFINE(KT_LCK_SPIN, lck_spin_t, KT_PRIV_ACCT);
145
146 #if LCK_MTX_USE_ARCH
147 KALLOC_TYPE_DEFINE(KT_LCK_MTX, lck_mtx_t, KT_PRIV_ACCT);
148 #endif /* LCK_MTX_USE_ARCH */
149
150 /*
151 * atomic exchange API is a low level abstraction of the operations
152 * to atomically read, modify, and write a pointer. This abstraction works
153 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
154 * well as the ARM exclusive instructions.
155 *
156 * atomic_exchange_begin() - begin exchange and retrieve current value
157 * atomic_exchange_complete() - conclude an exchange
158 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
159 */
160 uint32_t
atomic_exchange_begin32(uint32_t * target,uint32_t * previous,enum memory_order ord)161 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
162 {
163 uint32_t val;
164
165 (void)ord; // Memory order not used
166 val = os_atomic_load(target, relaxed);
167 *previous = val;
168 return val;
169 }
170
171 boolean_t
atomic_exchange_complete32(uint32_t * target,uint32_t previous,uint32_t newval,enum memory_order ord)172 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
173 {
174 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
175 }
176
void
atomic_exchange_abort(void)
{
	/*
	 * Nothing to do on this architecture: atomic_exchange_begin32()
	 * only performs a relaxed load, so there is no exclusive
	 * reservation (as with ARM load-exclusive) to release here.
	 */
}
181
182 boolean_t
atomic_test_and_set32(uint32_t * target,uint32_t test_mask,uint32_t set_mask,enum memory_order ord,boolean_t wait)183 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
184 {
185 uint32_t value, prev;
186
187 for (;;) {
188 value = atomic_exchange_begin32(target, &prev, ord);
189 if (value & test_mask) {
190 if (wait) {
191 cpu_pause();
192 } else {
193 atomic_exchange_abort();
194 }
195 return FALSE;
196 }
197 value |= set_mask;
198 if (atomic_exchange_complete32(target, prev, value, ord)) {
199 return TRUE;
200 }
201 }
202 }
203
204 /*
205 * Portable lock package implementation of usimple_locks.
206 */
207
208 #if USLOCK_DEBUG
209 #define USLDBG(stmt) stmt
210 void usld_lock_init(usimple_lock_t, unsigned short);
211 void usld_lock_pre(usimple_lock_t, pc_t);
212 void usld_lock_post(usimple_lock_t, pc_t);
213 void usld_unlock(usimple_lock_t, pc_t);
214 void usld_lock_try_pre(usimple_lock_t, pc_t);
215 void usld_lock_try_post(usimple_lock_t, pc_t);
216 int usld_lock_common_checks(usimple_lock_t, char *);
217 #else /* USLOCK_DEBUG */
218 #define USLDBG(stmt)
219 #endif /* USLOCK_DEBUG */
220
221 #if LCK_MTX_USE_ARCH
222 /*
223 * Forward definitions
224 */
225
226 static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state);
227 static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
228 static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
229 static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
230 static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
231 static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
232 #endif /* LCK_MTX_USE_ARCH */
233
234
235 /*
236 * Routine: lck_spin_alloc_init
237 */
238 lck_spin_t *
lck_spin_alloc_init(lck_grp_t * grp,lck_attr_t * attr)239 lck_spin_alloc_init(
240 lck_grp_t *grp,
241 lck_attr_t *attr)
242 {
243 lck_spin_t *lck;
244
245 lck = zalloc(KT_LCK_SPIN);
246 lck_spin_init(lck, grp, attr);
247 return lck;
248 }
249
250 /*
251 * Routine: lck_spin_free
252 */
253 void
lck_spin_free(lck_spin_t * lck,lck_grp_t * grp)254 lck_spin_free(
255 lck_spin_t *lck,
256 lck_grp_t *grp)
257 {
258 lck_spin_destroy(lck, grp);
259 zfree(KT_LCK_SPIN, lck);
260 }
261
262 /*
263 * Routine: lck_spin_init
264 */
265 void
lck_spin_init(lck_spin_t * lck,lck_grp_t * grp,__unused lck_attr_t * attr)266 lck_spin_init(
267 lck_spin_t *lck,
268 lck_grp_t *grp,
269 __unused lck_attr_t *attr)
270 {
271 usimple_lock_init((usimple_lock_t) lck, 0);
272 if (grp) {
273 lck_grp_reference(grp, &grp->lck_grp_spincnt);
274 }
275 }
276
277 /*
278 * Routine: lck_spin_destroy
279 */
280 void
lck_spin_destroy(lck_spin_t * lck,lck_grp_t * grp)281 lck_spin_destroy(
282 lck_spin_t *lck,
283 lck_grp_t *grp)
284 {
285 if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
286 return;
287 }
288 lck->interlock = LCK_SPIN_TAG_DESTROYED;
289 if (grp) {
290 lck_grp_deallocate(grp, &grp->lck_grp_spincnt);
291 }
292 return;
293 }
294
295 /*
296 * Routine: lck_spin_lock
297 */
298 void
lck_spin_lock_grp(lck_spin_t * lck,lck_grp_t * grp)299 lck_spin_lock_grp(
300 lck_spin_t *lck,
301 lck_grp_t *grp)
302 {
303 #pragma unused(grp)
304 usimple_lock((usimple_lock_t) lck, grp);
305 }
306
307 void
lck_spin_lock(lck_spin_t * lck)308 lck_spin_lock(
309 lck_spin_t *lck)
310 {
311 usimple_lock((usimple_lock_t) lck, NULL);
312 }
313
314 void
lck_spin_lock_nopreempt(lck_spin_t * lck)315 lck_spin_lock_nopreempt(
316 lck_spin_t *lck)
317 {
318 usimple_lock_nopreempt((usimple_lock_t) lck, NULL);
319 }
320
321 void
lck_spin_lock_nopreempt_grp(lck_spin_t * lck,lck_grp_t * grp)322 lck_spin_lock_nopreempt_grp(
323 lck_spin_t *lck,
324 lck_grp_t *grp)
325 {
326 #pragma unused(grp)
327 usimple_lock_nopreempt((usimple_lock_t) lck, grp);
328 }
329
330 /*
331 * Routine: lck_spin_unlock
332 */
333 void
lck_spin_unlock(lck_spin_t * lck)334 lck_spin_unlock(
335 lck_spin_t *lck)
336 {
337 usimple_unlock((usimple_lock_t) lck);
338 }
339
340 void
lck_spin_unlock_nopreempt(lck_spin_t * lck)341 lck_spin_unlock_nopreempt(
342 lck_spin_t *lck)
343 {
344 usimple_unlock_nopreempt((usimple_lock_t) lck);
345 }
346
347 boolean_t
lck_spin_try_lock_grp(lck_spin_t * lck,lck_grp_t * grp)348 lck_spin_try_lock_grp(
349 lck_spin_t *lck,
350 lck_grp_t *grp)
351 {
352 #pragma unused(grp)
353 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
354 #if DEVELOPMENT || DEBUG
355 if (lrval) {
356 pltrace(FALSE);
357 }
358 #endif
359 return lrval;
360 }
361
362
363 /*
364 * Routine: lck_spin_try_lock
365 */
366 boolean_t
lck_spin_try_lock(lck_spin_t * lck)367 lck_spin_try_lock(
368 lck_spin_t *lck)
369 {
370 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
371 #if DEVELOPMENT || DEBUG
372 if (lrval) {
373 pltrace(FALSE);
374 }
375 #endif
376 return lrval;
377 }
378
379 int
lck_spin_try_lock_nopreempt(lck_spin_t * lck)380 lck_spin_try_lock_nopreempt(
381 lck_spin_t *lck)
382 {
383 boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, LCK_GRP_NULL);
384 #if DEVELOPMENT || DEBUG
385 if (lrval) {
386 pltrace(FALSE);
387 }
388 #endif
389 return lrval;
390 }
391
392 int
lck_spin_try_lock_nopreempt_grp(lck_spin_t * lck,lck_grp_t * grp)393 lck_spin_try_lock_nopreempt_grp(
394 lck_spin_t *lck,
395 lck_grp_t *grp)
396 {
397 #pragma unused(grp)
398 boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, grp);
399 #if DEVELOPMENT || DEBUG
400 if (lrval) {
401 pltrace(FALSE);
402 }
403 #endif
404 return lrval;
405 }
406
407 /*
408 * Routine: lck_spin_assert
409 */
410 void
lck_spin_assert(lck_spin_t * lock,unsigned int type)411 lck_spin_assert(lck_spin_t *lock, unsigned int type)
412 {
413 thread_t thread, holder;
414 uintptr_t state;
415
416 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
417 panic("lck_spin_assert(): invalid arg (%u)", type);
418 }
419
420 state = lock->interlock;
421 holder = (thread_t)state;
422 thread = current_thread();
423 if (type == LCK_ASSERT_OWNED) {
424 if (__improbable(holder == THREAD_NULL)) {
425 panic("Lock not owned %p = %lx", lock, state);
426 }
427 if (__improbable(holder != thread)) {
428 panic("Lock not owned by current thread %p = %lx", lock, state);
429 }
430 } else if (type == LCK_ASSERT_NOTOWNED) {
431 if (__improbable(holder != THREAD_NULL)) {
432 if (holder == thread) {
433 panic("Lock owned by current thread %p = %lx", lock, state);
434 }
435 }
436 }
437 }
438
439 /*
440 * Routine: kdp_lck_spin_is_acquired
441 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
442 * Returns: TRUE if lock is acquired.
443 */
444 boolean_t
kdp_lck_spin_is_acquired(lck_spin_t * lck)445 kdp_lck_spin_is_acquired(lck_spin_t *lck)
446 {
447 if (not_in_kdp) {
448 panic("panic: spinlock acquired check done outside of kernel debugger");
449 }
450 return (lck->interlock != 0)? TRUE : FALSE;
451 }
452
453 /*
454 * Initialize a usimple_lock.
455 *
456 * No change in preemption state.
457 */
458 void
usimple_lock_init(usimple_lock_t l,__unused unsigned short tag)459 usimple_lock_init(
460 usimple_lock_t l,
461 __unused unsigned short tag)
462 {
463 USLDBG(usld_lock_init(l, tag));
464 hw_lock_init(&l->interlock);
465 }
466
/*
 * Timeout handler invoked by the hw_lock spin machinery when a
 * usimple_lock acquisition exceeds its spin budget (see
 * usimple_lock_spin_policy). If machine timeouts are suspended
 * (e.g. under a debugger), keep spinning; otherwise record the
 * timeout details and panic. Never returns on the panic path.
 */
static hw_spin_timeout_status_t
usimple_lock_acquire_timeout_panic(void *_lock, hw_spin_timeout_t to, hw_spin_state_t st)
{
	usimple_lock_t l = _lock;
	uintptr_t lowner;                       /* raw interlock word, read as the owner */
	lck_spinlock_to_info_t lsti;

	if (machine_timeout_suspended()) {
		return HW_LOCK_TIMEOUT_CONTINUE;
	}

	lowner = (uintptr_t)l->interlock.lock_data;
	/* capture owner/CPU diagnostics for the panic string below */
	lsti = lck_spinlock_timeout_hit(l, lowner);

	panic("Spinlock[%p] " HW_SPIN_TIMEOUT_FMT " ;"
	    "lock owner thread=0x%lx, current_thread: %p, "
	    "lock owner active on CPU %d, "
	    HW_SPIN_TIMEOUT_DETAILS_FMT,
	    l, HW_SPIN_TIMEOUT_ARG(to, st),
	    lowner, current_thread(), lsti->owner_cpu,
	    HW_SPIN_TIMEOUT_DETAILS_ARG(to, st));
}
489
/*
 * Spin policy for usimple locks: spin for up to LockTimeOutTSC ticks,
 * then call usimple_lock_acquire_timeout_panic() (which may allow the
 * spin to continue when machine timeouts are suspended).
 */
static const struct hw_spin_policy usimple_lock_spin_policy = {
	.hwsp_name = "usimple_lock_t",
	.hwsp_timeout = &LockTimeOutTSC,
	.hwsp_op_timeout = usimple_lock_acquire_timeout_panic,
};
495
496 /*
497 * Acquire a usimple_lock.
498 *
499 * Returns with preemption disabled. Note
500 * that the hw_lock routines are responsible for
501 * maintaining preemption state.
502 */
/*
 * The function name is parenthesized to suppress expansion of any
 * function-like macro of the same name at the definition site.
 * The lck_grp_t argument exists only when LCK_GRP_USE_ARG is set
 * (see LCK_GRP_ARG).
 */
void
(usimple_lock)(
	usimple_lock_t  l
	LCK_GRP_ARG(lck_grp_t *grp))
{
	DECL_PC(pc);                    /* caller PC, only under ANY_LOCK_DEBUG */

	OBTAIN_PC(pc);
	USLDBG(usld_lock_pre(l, pc));   /* pre-acquire sanity checks (USLOCK_DEBUG) */

	/* spin with timeout policy; disables preemption on success */
	(void)hw_lock_to(&l->interlock, &usimple_lock_spin_policy, grp);
#if DEVELOPMENT || DEBUG
	pltrace(FALSE);
#endif

	USLDBG(usld_lock_post(l, pc));  /* record owner/PC/CPU (USLOCK_DEBUG) */
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
#endif
}
523
524 /*
525 * Acquire a usimple_lock_nopreempt
526 *
527 * Called and returns with preemption disabled. Note
528 * that the hw_lock routines are responsible for
529 * maintaining preemption state.
530 */
/*
 * Acquire a usimple_lock without touching the preemption level.
 *
 * Called and returns with preemption disabled; the hw_lock routines
 * maintain the preemption state.
 */
static void
usimple_lock_nopreempt(
	usimple_lock_t l,
	lck_grp_t *grp)
{
	DECL_PC(pc);                    /* caller PC, only under ANY_LOCK_DEBUG */

	OBTAIN_PC(pc);
	USLDBG(usld_lock_pre(l, pc));   /* pre-acquire sanity checks (USLOCK_DEBUG) */

	(void)hw_lock_to_nopreempt(&l->interlock, &usimple_lock_spin_policy, grp);

#if DEVELOPMENT || DEBUG
	pltrace(FALSE);
#endif

	USLDBG(usld_lock_post(l, pc));  /* record owner/PC/CPU (USLOCK_DEBUG) */
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
#endif
}
552
553
554 /*
555 * Release a usimple_lock.
556 *
557 * Returns with preemption enabled. Note
558 * that the hw_lock routines are responsible for
559 * maintaining preemption state.
560 */
/*
 * Release a usimple_lock.
 *
 * Returns with preemption enabled; the hw_lock routines maintain the
 * preemption state. Debug checks run before the hardware release so
 * the lock state they inspect is still valid.
 */
void
usimple_unlock(
	usimple_lock_t l)
{
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_unlock(l, pc));     /* validate ownership before release */
#if DEVELOPMENT || DEBUG
	pltrace(TRUE);
#endif
	hw_lock_unlock(&l->interlock);  /* releases and re-enables preemption */
}
574
575 /*
576 * Release a usimple_unlock_nopreempt.
577 *
578 * Called and returns with preemption enabled. Note
579 * that the hw_lock routines are responsible for
580 * maintaining preemption state.
581 */
/*
 * Release a usimple_lock without touching the preemption level.
 *
 * Called and returns with preemption disabled; the hw_lock routines
 * maintain the preemption state.
 */
void
usimple_unlock_nopreempt(
	usimple_lock_t l)
{
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_unlock(l, pc));     /* validate ownership before release */
#if DEVELOPMENT || DEBUG
	pltrace(TRUE);
#endif
	hw_lock_unlock_nopreempt(&l->interlock);
}
595
596 /*
597 * Conditionally acquire a usimple_lock.
598 *
599 * On success, returns with preemption disabled.
600 * On failure, returns with preemption in the same state
601 * as when first invoked. Note that the hw_lock routines
602 * are responsible for maintaining preemption state.
603 *
604 * XXX No stats are gathered on a miss; I preserved this
605 * behavior from the original assembly-language code, but
606 * doesn't it make sense to log misses? XXX
607 */
608 unsigned int
usimple_lock_try(usimple_lock_t l,lck_grp_t * grp)609 usimple_lock_try(
610 usimple_lock_t l,
611 lck_grp_t *grp)
612 {
613 unsigned int success;
614 DECL_PC(pc);
615
616 OBTAIN_PC(pc);
617 USLDBG(usld_lock_try_pre(l, pc));
618 if ((success = hw_lock_try(&l->interlock, grp))) {
619 #if DEVELOPMENT || DEBUG
620 pltrace(FALSE);
621 #endif
622 USLDBG(usld_lock_try_post(l, pc));
623 }
624 return success;
625 }
626
627 /*
628 * Conditionally acquire a usimple_lock.
629 *
630 * Called and returns with preemption disabled. Note
631 * that the hw_lock routines are responsible for
632 * maintaining preemption state.
633 *
634 * XXX No stats are gathered on a miss; I preserved this
635 * behavior from the original assembly-language code, but
636 * doesn't it make sense to log misses? XXX
637 */
638 static unsigned int
usimple_lock_try_nopreempt(usimple_lock_t l,lck_grp_t * grp)639 usimple_lock_try_nopreempt(
640 usimple_lock_t l,
641 lck_grp_t *grp)
642 {
643 unsigned int success;
644 DECL_PC(pc);
645
646 OBTAIN_PC(pc);
647 USLDBG(usld_lock_try_pre(l, pc));
648 if ((success = hw_lock_try_nopreempt(&l->interlock, grp))) {
649 #if DEVELOPMENT || DEBUG
650 pltrace(FALSE);
651 #endif
652 USLDBG(usld_lock_try_post(l, pc));
653 }
654 return success;
655 }
656
657 /*
658 * Acquire a usimple_lock while polling for pending cpu signals
659 * and spinning on a lock.
660 *
661 */
/*
 * Spin attempting to acquire `l` until `deadline` (mach_absolute_time
 * units), servicing pending cross-CPU signals on each miss when
 * interrupts are disabled so the spin cannot deadlock against an IPI.
 *
 * Returns 1 when the lock was acquired, 0 when the deadline passed
 * first (including a deadline already in the past on entry).
 */
unsigned
int
(usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
    uint64_t deadline
    LCK_GRP_ARG(lck_grp_t *grp))
{
	/* sampled once: interrupt state is not expected to change while spinning */
	boolean_t istate = ml_get_interrupts_enabled();

	if (deadline < mach_absolute_time()) {
		return 0;
	}

	while (!simple_lock_try(l, grp)) {
		if (!istate) {
			/* interrupts are off: poll for pending cpu signals by hand */
			cpu_signal_handler(NULL);
		}

		if (deadline < mach_absolute_time()) {
			return 0;
		}

		cpu_pause();
	}

	return 1;
}
688
/*
 * Acquire `l`, spinning forever if necessary via the signal-safe loop
 * (deadline ULLONG_MAX). The initial try keeps the uncontended case on
 * the fast path.
 */
void
(usimple_lock_try_lock_loop)(usimple_lock_t l
    LCK_GRP_ARG(lck_grp_t *grp))
{
	/* When the lock is not contended, grab the lock and go. */
	if (!simple_lock_try(l, grp)) {
		usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
	}
}
698
/*
 * Spin attempting to acquire `l` for at most `duration` nanoseconds,
 * servicing pending cpu signals on misses (see the _deadline variant).
 *
 * Returns 1 when acquired, 0 when the duration elapsed first.
 */
unsigned
int
(usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
    uint64_t duration
    LCK_GRP_ARG(lck_grp_t *grp))
{
	uint64_t deadline;
	uint64_t base_at;       /* spin start, mach_absolute_time units */
	uint64_t duration_at;   /* duration converted to absolute-time units */

	/* Fast track for uncontended locks */
	if (simple_lock_try(l, grp)) {
		return 1;
	}

	base_at = mach_absolute_time();

	nanoseconds_to_absolutetime(duration, &duration_at);
	deadline = base_at + duration_at;
	if (deadline < base_at) {
		/* deadline has overflowed, make it saturate */
		deadline = ULLONG_MAX;
	}

	return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
}
725
726 #if USLOCK_DEBUG
727 /*
728 * States of a usimple_lock. The default when initializing
729 * a usimple_lock is setting it up for debug checking.
730 */
731 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
732 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
733 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
734 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
735 #define USLOCK_CHECKING(l) (uslock_check && \
736 ((l)->debug.state & USLOCK_CHECKED))
737
738 /*
739 * Initialize the debugging information contained
740 * in a usimple_lock.
741 */
742 void
usld_lock_init(usimple_lock_t l,__unused unsigned short tag)743 usld_lock_init(
744 usimple_lock_t l,
745 __unused unsigned short tag)
746 {
747 if (l == USIMPLE_LOCK_NULL) {
748 panic("lock initialization: null lock pointer");
749 }
750 l->lock_type = USLOCK_TAG;
751 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
752 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
753 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
754 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
755 l->debug.duration[0] = l->debug.duration[1] = 0;
756 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
757 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
758 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
759 }
760
761
762 /*
763 * These checks apply to all usimple_locks, not just
764 * those with USLOCK_CHECKED turned on.
765 */
/*
 * Validity checks applied to every usimple_lock operation, regardless
 * of whether USLOCK_CHECKED is set on the lock: non-null pointer,
 * correct type tag, and initialized debug state. Panics (with `caller`
 * in the message) on any violation.
 *
 * Returns nonzero iff the per-lock debug checks (USLOCK_CHECKING) are
 * enabled, so callers know whether to run their additional checks.
 */
int
usld_lock_common_checks(
	usimple_lock_t  l,
	char            *caller)
{
	if (l == USIMPLE_LOCK_NULL) {
		panic("%s: null lock pointer", caller);
	}
	if (l->lock_type != USLOCK_TAG) {
		panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
	}
	if (!(l->debug.state & USLOCK_INIT)) {
		panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
	}
	return USLOCK_CHECKING(l);
}
782
783
784 /*
785 * Debug checks on a usimple_lock just before attempting
786 * to acquire it.
787 */
788 /* ARGSUSED */
/*
 * Debug checks on a usimple_lock just before attempting to acquire it:
 * common validity checks, then a recursion check — panic if the current
 * thread already holds this lock.
 */
/* ARGSUSED */
void
usld_lock_pre(
	usimple_lock_t  l,
	pc_t            pc)
{
	char    caller[] = "usimple_lock";


	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

	/*
	 * Note that we have a weird case where we are getting a lock when we are
	 * in the process of putting the system to sleep. We are running with no
	 * current threads, therefore we can't tell if we are trying to retake a lock
	 * we have or someone on the other processor has it. Therefore we just
	 * ignore this test if the locking thread is 0.
	 */

	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
	    l->debug.lock_thread == (void *) current_thread()) {
		printf("%s: lock %p already locked (at %p) by",
		    caller, l, l->debug.lock_pc);
		printf(" current thread %p (new attempt at pc %p)\n",
		    l->debug.lock_thread, pc);
		panic("%s", caller);
	}
	/*
	 * NOTE(review): this disable/enable pair is a net no-op on the
	 * preemption level; presumably historical — confirm before removing.
	 */
	mp_disable_preemption();
	mp_enable_preemption();
}
820
821
822 /*
823 * Debug checks on a usimple_lock just after acquiring it.
824 *
825 * Pre-emption has been disabled at this point,
826 * so we are safe in using cpu_number.
827 */
/*
 * Debug checks on a usimple_lock just after acquiring it: verify the
 * debug state was not corrupted or stolen while we spun, then record
 * the new owner thread, acquisition PC, and CPU.
 *
 * Pre-emption has been disabled at this point,
 * so we are safe in using cpu_number.
 */
void
usld_lock_post(
	usimple_lock_t  l,
	pc_t            pc)
{
	unsigned int    mycpu;
	char    caller[] = "successful usimple_lock";


	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

	/* all state bits except TAKEN must still read as freshly initialized */
	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
		panic("%s: lock %p became uninitialized",
		    caller, l);
	}
	/* we just acquired it, so nobody else may have marked it TAKEN */
	if ((l->debug.state & USLOCK_TAKEN)) {
		panic("%s: lock 0x%p became TAKEN by someone else",
		    caller, l);
	}

	mycpu = (unsigned int)cpu_number();
	assert(mycpu <= UCHAR_MAX);     /* lock_cpu is stored in an unsigned char */

	l->debug.lock_thread = (void *)current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = (unsigned char)mycpu;
}
858
859
860 /*
861 * Debug checks on a usimple_lock just before
862 * releasing it. Note that the caller has not
863 * yet released the hardware lock.
864 *
865 * Preemption is still disabled, so there's
866 * no problem using cpu_number.
867 */
868 void
usld_unlock(usimple_lock_t l,pc_t pc)869 usld_unlock(
870 usimple_lock_t l,
871 pc_t pc)
872 {
873 unsigned int mycpu;
874 char caller[] = "usimple_unlock";
875
876
877 if (!usld_lock_common_checks(l, caller)) {
878 return;
879 }
880
881 mycpu = cpu_number();
882 assert(mycpu <= UCHAR_MAX);
883
884 if (!(l->debug.state & USLOCK_TAKEN)) {
885 panic("%s: lock 0x%p hasn't been taken",
886 caller, l);
887 }
888 if (l->debug.lock_thread != (void *) current_thread()) {
889 panic("%s: unlocking lock 0x%p, owned by thread %p",
890 caller, l, l->debug.lock_thread);
891 }
892 if (l->debug.lock_cpu != mycpu) {
893 printf("%s: unlocking lock 0x%p on cpu 0x%x",
894 caller, l, mycpu);
895 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
896 panic("%s", caller);
897 }
898
899 l->debug.unlock_thread = l->debug.lock_thread;
900 l->debug.lock_thread = INVALID_PC;
901 l->debug.state &= ~USLOCK_TAKEN;
902 l->debug.unlock_pc = pc;
903 l->debug.unlock_cpu = (unsigned char)mycpu;
904 }
905
906
907 /*
908 * Debug checks on a usimple_lock just before
909 * attempting to acquire it.
910 *
911 * Preemption isn't guaranteed to be disabled.
912 */
913 void
usld_lock_try_pre(usimple_lock_t l,__unused pc_t pc)914 usld_lock_try_pre(
915 usimple_lock_t l,
916 __unused pc_t pc)
917 {
918 char caller[] = "usimple_lock_try";
919
920 if (!usld_lock_common_checks(l, caller)) {
921 return;
922 }
923 }
924
925
926 /*
927 * Debug checks on a usimple_lock just after
928 * successfully attempting to acquire it.
929 *
930 * Preemption has been disabled by the
931 * lock acquisition attempt, so it's safe
932 * to use cpu_number.
933 */
/*
 * Debug checks on a usimple_lock just after successfully try-acquiring
 * it: verify the debug state survived intact, then record the new
 * owner thread, acquisition PC, and CPU.
 *
 * Preemption has been disabled by the
 * lock acquisition attempt, so it's safe
 * to use cpu_number.
 */
void
usld_lock_try_post(
	usimple_lock_t  l,
	pc_t            pc)
{
	unsigned int    mycpu;
	char    caller[] = "successful usimple_lock_try";

	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

	/* all state bits except TAKEN must still read as freshly initialized */
	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
		panic("%s: lock 0x%p became uninitialized",
		    caller, l);
	}
	/* we just acquired it, so nobody else may have marked it TAKEN */
	if ((l->debug.state & USLOCK_TAKEN)) {
		panic("%s: lock 0x%p became TAKEN by someone else",
		    caller, l);
	}

	mycpu = cpu_number();
	assert(mycpu <= UCHAR_MAX);     /* lock_cpu is stored in an unsigned char */

	l->debug.lock_thread = (void *) current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = (unsigned char)mycpu;
}
963 #endif /* USLOCK_DEBUG */
964 #if LCK_MTX_USE_ARCH
965
966 /*
967 * Slow path routines for lck_mtx locking and unlocking functions.
968 *
969 * These functions were previously implemented in x86 assembly,
970 * and some optimizations are in place in this c code to obtain a compiled code
971 * as performant and compact as the assembly version.
972 *
973 * To avoid to inline these functions on the fast path, all functions directly called by
974 * the fast paths have the __attribute__((noinline)) specified. Also they are all implemented
975 * in such a way the fast path can tail call into them. In this way the return address
976 * does not need to be pushed on the caller stack and stack optimization can happen on the caller.
977 *
978 * Slow path code is structured in such a way there are no calls to functions that will return
979 * on the context of the caller function, i.e. all functions called are or tail call functions
980 * or inline functions. The number of arguments of the tail call functions are less then six,
981 * so that they can be passed over registers and do not need to be pushed on stack.
982 * This allows the compiler to not create a stack frame for the functions.
983 *
984 * __improbable and __probable are used to compile the slow path code in such a way
985 * the fast path case will be on a sequence of instructions with as less jumps as possible,
986 * to make this case the most optimized even if falling through the slow path.
987 */
988
989 /*
990 * Intel lock invariants:
991 *
992 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
993 *
994 * The lock owner is promoted to the max priority of all its waiters only if it
995 * was a lower priority when it acquired or was an owner when a waiter waited.
996 * Max priority is capped at MAXPRI_PROMOTE.
997 *
998 * The last waiter will not be promoted as it is woken up, but the last
999 * lock owner may not have been the last thread to have been woken up depending on the
1000 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
1001 * flag set.
1002 *
1003 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
1004 * priority from dropping priority in the future without having to take thread lock
1005 * on acquire.
1006 */
1007
1008 /*
1009 * Routine: lck_mtx_alloc_init
1010 */
1011 lck_mtx_t *
lck_mtx_alloc_init(lck_grp_t * grp,lck_attr_t * attr)1012 lck_mtx_alloc_init(
1013 lck_grp_t *grp,
1014 lck_attr_t *attr)
1015 {
1016 lck_mtx_t *lck;
1017
1018 lck = zalloc(KT_LCK_MTX);
1019 lck_mtx_init(lck, grp, attr);
1020 return lck;
1021 }
1022
1023 /*
1024 * Routine: lck_mtx_free
1025 */
1026 void
lck_mtx_free(lck_mtx_t * lck,lck_grp_t * grp)1027 lck_mtx_free(
1028 lck_mtx_t *lck,
1029 lck_grp_t *grp)
1030 {
1031 lck_mtx_destroy(lck, grp);
1032 zfree(KT_LCK_MTX, lck);
1033 }
1034
1035 /*
1036 * Routine: lck_mtx_init
1037 */
/*
 * Routine: lck_mtx_init
 *
 * Initialize a mutex in place: zero the state, record the owning
 * group's attr id, enable profiling when either the lock attributes
 * or the group request debug, and take a reference on the group.
 */
void
lck_mtx_init(
	lck_mtx_t       *lck,
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	if (attr == LCK_ATTR_NULL) {
		attr = &lck_attr_default;
	}

	/* whole-struct assignment zeroes all other fields */
	*lck = (lck_mtx_t){
		.lck_mtx_grp = grp->lck_grp_attr_id,
	};

	if (__improbable((attr->lck_attr_val & LCK_ATTR_DEBUG) ||
	    (grp->lck_grp_attr_id & LCK_GRP_ATTR_DEBUG))) {
		lck->lck_mtx_profile = 1;
	}

	lck_grp_reference(grp, &grp->lck_grp_mtxcnt);
}
1059
1060 /*
1061 * Routine: lck_mtx_destroy
1062 */
/*
 * Routine: lck_mtx_destroy
 *
 * Mark a mutex destroyed and drop its group accounting. Destroying an
 * already-destroyed mutex is a harmless no-op; destroying an owned
 * mutex trips the NOTOWNED assert under MACH_LDEBUG.
 */
void
lck_mtx_destroy(
	lck_mtx_t       *lck,
	lck_grp_t       *grp)
{
	if (lck->lck_mtx_state == LCK_MTX_TAG_DESTROYED) {
		return;
	}
#if MACH_LDEBUG
	lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
#endif
	/* release-store so prior lock traffic is visible before the tag */
	ordered_store_mtx_state_release(lck, LCK_MTX_TAG_DESTROYED);
	lck_grp_deallocate(grp, &grp->lck_grp_mtxcnt);
}
1077
1078
1079 #if DEVELOPMENT | DEBUG
/*
 * Panic helper for the DEVELOPMENT/DEBUG owner check in the unlock
 * path: resolves the recorded owner ctid to a thread for the panic
 * message. noinline keeps this cold path out of the callers.
 */
__attribute__((noinline))
void
lck_mtx_owner_check_panic(
	lck_mtx_t       *lock)
{
	thread_t owner = ctid_get_thread_unsafe(lock->lck_mtx_owner);
	panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
}
1088 #endif
1089
1090 /*
1091 * Routine: lck_mtx_unlock_slow
1092 *
1093 * Unlocks a mutex held by current thread.
1094 *
1095 * It will wake up waiters if necessary.
1096 *
1097 * Interlock can be held.
1098 */
/*
 * Routine: lck_mtx_unlock_slow
 *
 * Unlocks a mutex held by current thread.
 *
 * It will wake up waiters if necessary.
 *
 * Interlock can be held: when entered with the mutex held as a
 * spinlock (MLOCKED clear), the interlock is already owned and the
 * interlock-acquire step is skipped via the goto.
 */
__attribute__((noinline))
void
lck_mtx_unlock_slow(
	lck_mtx_t       *lock)
{
	thread_t        thread;
	uint32_t        state;

	state = ordered_load_mtx_state(lock);

	thread = current_thread();

#if DEVELOPMENT | DEBUG
	/* only the recorded owner (by ctid) may unlock */
	if (__improbable(lock->lck_mtx_owner != thread->ctid)) {
		lck_mtx_owner_check_panic(lock);
	}
#endif

	/* check if it is held as a spinlock: interlock already ours, skip acquire */
	if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
		goto unlock;
	}

	/* take the interlock and drop the mutex-locked bit in one step */
	lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);

unlock:
	/* preemption disabled, interlock held and mutex not held */

	/* clear owner */
	ordered_store_mtx_owner(lock, 0);

	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
#if MACH_LDEBUG
		if (thread) {
			thread->mutex_count--;
		}
#endif
		/* tail call: wakeup path releases the interlock itself */
		return lck_mtx_unlock_wakeup_tail(lock, state);
	}

	/* release interlock, promotion and clear spin flag */
	state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
	ordered_store_mtx_state_release(lock, state);   /* since I own the interlock, I don't need an atomic update */

#if MACH_LDEBUG
	/* perform lock statistics after drop to prevent delay */
	if (thread) {
		thread->mutex_count--;          /* lock statistic */
	}
#endif  /* MACH_LDEBUG */

	/* re-enable preemption */
	lck_mtx_unlock_finish_inline(lock, state);
}
1153
/* kdebug (KERNEL_DEBUG) sub-codes for DBG_MACH_LOCKS mutex tracepoints */
#define LCK_MTX_LCK_WAIT_CODE           0x20
#define LCK_MTX_LCK_WAKEUP_CODE         0x21
#define LCK_MTX_LCK_SPIN_CODE           0x22
#define LCK_MTX_LCK_ACQUIRE_CODE        0x23
#define LCK_MTX_LCK_DEMOTE_CODE         0x24
1159
/*
 * Routine:	lck_mtx_unlock_wakeup_tail
 *
 * Invoked on unlock when there is
 * contention, i.e. the assembly routine sees
 * that mutex->lck_mtx_waiters != 0
 *
 * neither the mutex or interlock is held
 *
 * Note that this routine might not be called if there are pending
 * waiters which have previously been woken up, and they didn't
 * end up boosting the old owner.
 *
 * assembly routine previously did the following to mutex:
 * (after saving the state in prior_lock_state)
 *      decremented lck_mtx_waiters if nonzero
 *
 * This function needs to be called as a tail call
 * to optimize the compiled code.
 */
__attribute__((noinline))
static void
lck_mtx_unlock_wakeup_tail(
	lck_mtx_t       *mutex,
	uint32_t        state)
{
	struct turnstile *ts;

	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
	kern_return_t did_wake;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);

	ts = turnstile_prepare_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);

	/* wake exactly one waiter; waiters count is 1:1 with needed wakeups */
	did_wake = waitq_wakeup64_one(&ts->ts_waitq, LCK_MTX_EVENT(mutex),
	    THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
	assert(did_wake == KERN_SUCCESS);

	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
	turnstile_complete_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);

	/* account for the waiter we just woke, then drop interlock + spin flag */
	state -= LCK_MTX_WAITER;
	state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
	ordered_store_mtx_state_release(mutex, state);

	assert(current_thread()->turnstile != NULL);

	turnstile_cleanup();

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);

	lck_mtx_unlock_finish_inline(mutex, state);
}
1216
/*
 * Routine:	lck_mtx_lock_acquire_x86
 *
 * Invoked on acquiring the mutex when there is
 * contention (i.e. the assembly routine sees that
 * that mutex->lck_mtx_waiters != 0
 *
 * mutex is owned...  interlock is held... preemption is disabled
 *
 * @ts may carry a turnstile prepared earlier by lck_mtx_lock_wait_x86;
 * if NULL and there are still waiters, one is prepared here.
 */
__attribute__((always_inline))
static void
lck_mtx_lock_acquire_inline(
	lck_mtx_t       *mutex,
	struct turnstile *ts)
{
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);

	if (mutex->lck_mtx_waiters > 0) {
		if (ts == NULL) {
			ts = turnstile_prepare_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
		}

		/* remaining waiters now push on us, the new owner */
		turnstile_update_inheritor(ts, current_thread(),
		    (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
	}

	if (ts != NULL) {
		turnstile_complete_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
	}

	assert(current_thread()->turnstile != NULL);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
	    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
}
1256
1257 void
lck_mtx_lock_acquire_x86(lck_mtx_t * mutex)1258 lck_mtx_lock_acquire_x86(
1259 lck_mtx_t *mutex)
1260 {
1261 return lck_mtx_lock_acquire_inline(mutex, NULL);
1262 }
1263
/*
 * Tail call helpers for lock functions that perform
 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
 * the caller's compiled code.
 */

/*
 * Finish a contended acquire: do waiter/turnstile bookkeeping (consuming
 * @ts if non-NULL), then release the interlock and re-enable preemption.
 */
__attribute__((noinline))
static void
lck_mtx_lock_acquire_tail(
	lck_mtx_t       *mutex,
	struct turnstile *ts)
{
	lck_mtx_lock_acquire_inline(mutex, ts);
	lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex));
}
1279
/*
 * Finish a successful try-lock that found waiters: do the turnstile
 * bookkeeping and release the interlock.  Always returns TRUE so the
 * caller can tail-call it.
 */
__attribute__((noinline))
static boolean_t
lck_mtx_try_lock_acquire_tail(
	lck_mtx_t       *mutex)
{
	lck_mtx_lock_acquire_inline(mutex, NULL);
	lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));

	return TRUE;
}
1290
/*
 * Finish a spin-to-mutex conversion that found waiters: do the turnstile
 * bookkeeping, then release the interlock and re-enable preemption.
 */
__attribute__((noinline))
static void
lck_mtx_convert_spin_acquire_tail(
	lck_mtx_t       *mutex)
{
	lck_mtx_lock_acquire_inline(mutex, NULL);
	lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
}
1299
1300 static void
lck_mtx_ilk_unlock(lck_mtx_t * mutex)1301 lck_mtx_ilk_unlock(
1302 lck_mtx_t *mutex)
1303 {
1304 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
1305 }
1306
/*
 * Acquire the interlock, atomically setting xor_flags and clearing
 * and_flags in the same compare-exchange.
 *
 * NOTE(review): despite the name, xor_flags is OR'd into the state and
 * is also treated as a "busy" condition while waiting (we spin until
 * both the interlock and xor_flags bits are clear).
 *
 * Spins (with preemption enabled) until the exchange succeeds; returns
 * with preemption disabled and *new_state holding the new state word.
 */
static inline void
lck_mtx_interlock_lock_set_and_clear_flags(
	lck_mtx_t       *mutex,
	uint32_t        xor_flags,
	uint32_t        and_flags,
	uint32_t        *new_state)
{
	uint32_t state, prev;
	state = *new_state;

	for (;;) {
		/* have to wait for interlock to clear */
		while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
			cpu_pause();
			state = ordered_load_mtx_state(mutex);
		}
		prev = state;                                   /* prev contains snapshot for exchange */
		state |= LCK_MTX_ILOCKED_MSK | xor_flags;       /* pick up interlock */
		state &= ~and_flags;                            /* clear flags */

		/* preemption must be off before we can hold the interlock */
		disable_preemption();
		if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
			break;
		}
		/* lost the race: back off with preemption on and retry */
		enable_preemption();
		cpu_pause();
		state = ordered_load_mtx_state(mutex);
	}
	*new_state = state;
	return;
}
1338
1339 static inline void
lck_mtx_interlock_lock_clear_flags(lck_mtx_t * mutex,uint32_t and_flags,uint32_t * new_state)1340 lck_mtx_interlock_lock_clear_flags(
1341 lck_mtx_t *mutex,
1342 uint32_t and_flags,
1343 uint32_t *new_state)
1344 {
1345 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
1346 }
1347
1348 static inline void
lck_mtx_interlock_lock(lck_mtx_t * mutex,uint32_t * new_state)1349 lck_mtx_interlock_lock(
1350 lck_mtx_t *mutex,
1351 uint32_t *new_state)
1352 {
1353 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
1354 }
1355
/*
 * Single attempt to grab the interlock, atomically setting or_flags
 * in the same exchange.  Fails immediately (no spinning) if the
 * interlock or any bit of or_flags is already set.
 *
 * Returns 1 with preemption disabled and *new_state updated on
 * success; 0 with preemption level unchanged on failure.
 */
static inline int
lck_mtx_interlock_try_lock_set_flags(
	lck_mtx_t       *mutex,
	uint32_t        or_flags,
	uint32_t        *new_state)
{
	uint32_t state, prev;
	state = *new_state;

	/* have to wait for interlock to clear */
	if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
		return 0;
	}
	prev = state;                                   /* prev contains snapshot for exchange */
	state |= LCK_MTX_ILOCKED_MSK | or_flags;        /* pick up interlock */
	disable_preemption();
	if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
		*new_state = state;
		return 1;
	}

	enable_preemption();
	return 0;
}
1380
/*
 * Contended-path acquire: adaptively spin, and if that fails, block on
 * the lock's turnstile until it can be taken.  Called with the interlock
 * NOT held; returns with the mutex owned, interlock released and
 * preemption re-enabled.
 *
 * @profile    whether contention profiling is enabled for this lock
 * @first_miss profiling cookie, threaded through LCK_MTX_PROF_MISS/WAIT
 */
__attribute__((noinline))
static void
lck_mtx_lock_contended(
	lck_mtx_t       *lock,
	bool            profile,
	boolean_t       *first_miss)
{
	lck_mtx_spinwait_ret_type_t ret;
	uint32_t state;
	thread_t thread;
	struct turnstile *ts = NULL;

try_again:

	if (profile) {
		LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, first_miss);
	}

	ret = lck_mtx_lock_spinwait_x86(lock);
	state = ordered_load_mtx_state(lock);
	switch (ret) {
	case LCK_MTX_SPINWAIT_NO_SPIN:
		/*
		 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
		 * try to spin.
		 */
		/* just fall through case LCK_MTX_SPINWAIT_SPUN */
		OS_FALLTHROUGH;
	case LCK_MTX_SPINWAIT_SPUN_HIGH_THR:
	case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE:
	case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
	case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR:
		/*
		 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
		 * interlock not held
		 */
		lck_mtx_interlock_lock(lock, &state);
		assert(state & LCK_MTX_ILOCKED_MSK);

		if (state & LCK_MTX_MLOCKED_MSK) {
			if (profile) {
				LCK_MTX_PROF_WAIT(lock, lock->lck_mtx_grp,
				    ret == LCK_MTX_SPINWAIT_NO_SPIN, first_miss);
			}
			/* still held as a mutex: block, then start over */
			lck_mtx_lock_wait_x86(lock, &ts);
			/*
			 * interlock is not held here.
			 */
			goto try_again;
		} else {
			/* grab the mutex */
			state |= LCK_MTX_MLOCKED_MSK;
			ordered_store_mtx_state_release(lock, state);
			thread = current_thread();
			ordered_store_mtx_owner(lock, thread->ctid);
#if MACH_LDEBUG
			if (thread) {
				thread->mutex_count++;
			}
#endif  /* MACH_LDEBUG */
		}

		break;
	case LCK_MTX_SPINWAIT_ACQUIRED:
		/*
		 * mutex has been acquired by lck_mtx_lock_spinwait_x86
		 * interlock is held and preemption disabled
		 * owner is set and mutex marked as locked
		 * statistics updated too
		 */
		break;
	default:
		panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p", ret, lock);
	}

	/*
	 * interlock is already acquired here
	 */

	/* mutex has been acquired */
	if (state & LCK_MTX_WAITERS_MSK) {
		/*
		 * lck_mtx_lock_acquire_tail will call
		 * turnstile_complete.
		 */
		return lck_mtx_lock_acquire_tail(lock, ts);
	}

	if (ts != NULL) {
		/* a turnstile was prepared in lck_mtx_lock_wait_x86; retire it */
		turnstile_complete_hash((uintptr_t)lock, TURNSTILE_KERNEL_MUTEX);
	}

	assert(current_thread()->turnstile != NULL);

	/* release the interlock */
	lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock));
}
1478
/*
 * Helper noinline functions for calling
 * panic to optimize compiled code.
 */

/* Cold path: an interlock attempt found the mutex tagged DESTROYED. */
__attribute__((noinline)) __abortlike
static void
lck_mtx_destroyed(
	lck_mtx_t       *lock)
{
	panic("trying to interlock destroyed mutex (%p)", lock);
}
1491
1492 __attribute__((noinline))
1493 static boolean_t
lck_mtx_try_destroyed(lck_mtx_t * lock)1494 lck_mtx_try_destroyed(
1495 lck_mtx_t *lock)
1496 {
1497 panic("trying to interlock destroyed mutex (%p)", lock);
1498 return FALSE;
1499 }
1500
/*
 * Spin until both the interlock and the mutex bit are clear.
 *
 * Returns TRUE with *new_state holding the observed unlocked state
 * (caller can then attempt the interlock), or FALSE as soon as the
 * lock is seen held in mutex mode (caller takes the blocking path).
 */
__attribute__((always_inline))
static boolean_t
lck_mtx_lock_wait_interlock_to_clear(
	lck_mtx_t       *lock,
	uint32_t* new_state)
{
	uint32_t state;

	for (;;) {
		cpu_pause();
		state = ordered_load_mtx_state(lock);
		if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
			*new_state = state;
			return TRUE;
		}
		if (state & LCK_MTX_MLOCKED_MSK) {
			/* if it is held as mutex, just fail */
			return FALSE;
		}
		/* otherwise only the interlock is held: keep spinning */
	}
}
1522
/*
 * Try-lock flavor of the wait-for-interlock spin: fails as soon as the
 * lock is seen held in mutex OR spin mode (try-lock paths must not
 * block behind an owner).
 *
 * Returns TRUE with *new_state holding a snapshot with the interlock
 * clear, FALSE if the lock is owned.
 */
__attribute__((always_inline))
static boolean_t
lck_mtx_try_lock_wait_interlock_to_clear(
	lck_mtx_t       *lock,
	uint32_t* new_state)
{
	uint32_t state;

	for (;;) {
		cpu_pause();
		state = ordered_load_mtx_state(lock);
		if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
			/* if it is held as mutex or spin, just fail */
			return FALSE;
		}
		if (!(state & LCK_MTX_ILOCKED_MSK)) {
			*new_state = state;
			return TRUE;
		}
	}
}
1544
1545 /*
1546 * Routine: lck_mtx_lock_slow
1547 *
1548 * Locks a mutex for current thread.
1549 * If the lock is contended this function might
1550 * sleep.
1551 *
1552 * Called with interlock not held.
1553 */
1554 __attribute__((noinline))
1555 void
lck_mtx_lock_slow(lck_mtx_t * lock)1556 lck_mtx_lock_slow(
1557 lck_mtx_t *lock)
1558 {
1559 bool profile;
1560 uint32_t state;
1561 int first_miss = 0;
1562
1563 state = ordered_load_mtx_state(lock);
1564 profile = (state & LCK_MTX_PROFILE_MSK);
1565
1566 if (__improbable(profile)) {
1567 if (state & LCK_MTX_SPIN_MSK) {
1568 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
1569 assert(state & LCK_MTX_ILOCKED_MSK);
1570 LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
1571 }
1572 }
1573
1574 /* is the interlock or mutex held */
1575 if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
1576 /*
1577 * Note: LCK_MTX_TAG_DESTROYED has
1578 * LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
1579 */
1580
1581 /* is the mutex already held */
1582 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
1583 /* no, must have been the mutex */
1584 return lck_mtx_lock_contended(lock, profile, &first_miss);
1585 }
1586
1587 /* check to see if it is marked destroyed */
1588 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
1589 lck_mtx_destroyed(lock);
1590 }
1591 }
1592
1593 /* no - can't be DESTROYED or locked */
1594 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
1595 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
1596 return lck_mtx_lock_contended(lock, profile, &first_miss);
1597 }
1598 }
1599
1600 /* lock and interlock acquired */
1601
1602 /* record owner of mutex */
1603 ordered_store_mtx_owner(lock, current_thread()->ctid);
1604
1605 #if MACH_LDEBUG
1606 if (thread) {
1607 thread->mutex_count++; /* lock statistic */
1608 }
1609 #endif
1610 /*
1611 * Check if there are waiters to
1612 * inherit their priority.
1613 */
1614 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
1615 return lck_mtx_lock_acquire_tail(lock, NULL);
1616 }
1617
1618 /* release the interlock */
1619 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock));
1620 }
1621
1622 __attribute__((noinline))
1623 boolean_t
lck_mtx_try_lock_slow(lck_mtx_t * lock)1624 lck_mtx_try_lock_slow(
1625 lck_mtx_t *lock)
1626 {
1627 bool profile;
1628 uint32_t state;
1629 int first_miss = 0;
1630
1631 state = ordered_load_mtx_state(lock);
1632 profile = (state & LCK_MTX_PROFILE_MSK);
1633
1634 /* is the interlock or mutex held */
1635 if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
1636 /*
1637 * Note: LCK_MTX_TAG_DESTROYED has
1638 * LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
1639 */
1640
1641 /* is the mutex already held */
1642 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
1643 return FALSE;
1644 }
1645
1646 /* check to see if it is marked destroyed */
1647 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
1648 lck_mtx_try_destroyed(lock);
1649 }
1650
1651 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
1652 if (__improbable(profile)) {
1653 LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
1654 }
1655 return FALSE;
1656 }
1657 }
1658
1659 if (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
1660 if (__improbable(profile)) {
1661 LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
1662 }
1663 return FALSE;
1664 }
1665
1666 /* lock and interlock acquired */
1667
1668 /* record owner of mutex */
1669 ordered_store_mtx_owner(lock, current_thread()->ctid);
1670
1671 #if MACH_LDEBUG
1672 if (thread) {
1673 thread->mutex_count++; /* lock statistic */
1674 }
1675 #endif
1676 /*
1677 * Check if there are waiters to
1678 * inherit their priority.
1679 */
1680 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
1681 return lck_mtx_try_lock_acquire_tail(lock);
1682 }
1683
1684 /* release the interlock */
1685 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
1686
1687 return TRUE;
1688 }
1689
1690 __attribute__((noinline))
1691 void
lck_mtx_lock_spin_slow(lck_mtx_t * lock)1692 lck_mtx_lock_spin_slow(
1693 lck_mtx_t *lock)
1694 {
1695 bool profile;
1696 uint32_t state;
1697 int first_miss = 0;
1698
1699 state = ordered_load_mtx_state(lock);
1700 profile = (state & LCK_MTX_PROFILE_MSK);
1701
1702 if (__improbable(profile)) {
1703 if (state & LCK_MTX_SPIN_MSK) {
1704 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
1705 assert(state & LCK_MTX_ILOCKED_MSK);
1706 LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
1707 }
1708 }
1709
1710 /* is interlock or mutex held */
1711 if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
1712 /*
1713 * Note: LCK_MTX_TAG_DESTROYED has
1714 * LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
1715 */
1716
1717 /* is the mutex already held */
1718 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
1719 /* no, must have been the mutex */
1720 return lck_mtx_lock_contended(lock, profile, &first_miss);
1721 }
1722
1723 /* check to see if it is marked destroyed */
1724 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
1725 lck_mtx_destroyed(lock);
1726 }
1727 }
1728
1729 bool acquired = true;
1730
1731 #if CONFIG_DTRACE
1732 uint64_t spin_start;
1733
1734 if (__probable(lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
1735 goto past_spin;
1736 }
1737
1738 spin_start = LCK_MTX_SPIN_SPIN_BEGIN();
1739 #endif /* CONFIG_DTRACE */
1740
1741 /* note - can't be DESTROYED or locked */
1742 /* note - preemption is not disabled while spinning */
1743 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
1744 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
1745 acquired = false;
1746 break;
1747 }
1748 }
1749
1750 LCK_MTX_SPIN_SPIN_END(lock, lock->lck_mtx_grp, spin_start);
1751 #if CONFIG_DTRACE
1752 past_spin:
1753 #endif /* CONFIG_DTRACE */
1754
1755 if (__improbable(!acquired)) {
1756 return lck_mtx_lock_contended(lock, profile, &first_miss);
1757 }
1758
1759 /* lock as spinlock and interlock acquired */
1760
1761 /* record owner of mutex */
1762 ordered_store_mtx_owner(lock, current_thread()->ctid);
1763
1764 #if MACH_LDEBUG
1765 if (thread) {
1766 thread->mutex_count++; /* lock statistic */
1767 }
1768 #endif
1769 LCK_MTX_ACQUIRED(lock, lock->lck_mtx_grp, true, profile);
1770
1771 /* return with the interlock held and preemption disabled */
1772 return;
1773 }
1774
1775 __attribute__((noinline))
1776 boolean_t
lck_mtx_try_lock_spin_slow(lck_mtx_t * lock)1777 lck_mtx_try_lock_spin_slow(
1778 lck_mtx_t *lock)
1779 {
1780 bool profile;
1781 uint32_t state;
1782 int first_miss = 0;
1783
1784 state = ordered_load_mtx_state(lock);
1785 profile = (state & LCK_MTX_PROFILE_MSK);
1786
1787 /* is the interlock or mutex held */
1788 if (__improbable(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
1789 /*
1790 * Note: LCK_MTX_TAG_DESTROYED has
1791 * LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK set.
1792 */
1793
1794 /* is the mutex already held */
1795 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
1796 return FALSE;
1797 }
1798
1799 /* check to see if it is marked destroyed */
1800 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
1801 lck_mtx_try_destroyed(lock);
1802 }
1803 }
1804
1805 /* note - can't be DESTROYED or locked */
1806 if (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
1807 if (__improbable(profile)) {
1808 LCK_MTX_PROF_MISS(lock, lock->lck_mtx_grp, &first_miss);
1809 }
1810 return FALSE;
1811 }
1812
1813 /* lock and interlock acquired */
1814
1815 /* record owner of mutex */
1816 ordered_store_mtx_owner(lock, current_thread()->ctid);
1817
1818 #if MACH_LDEBUG
1819 if (thread) {
1820 thread->mutex_count++; /* lock statistic */
1821 }
1822 #endif
1823
1824 #if CONFIG_DTRACE
1825 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_SPIN_ACQUIRE, lock, 0);
1826 #endif
1827 return TRUE;
1828 }
1829
/*
 * Routine:	lck_mtx_convert_spin
 *
 * Convert a mutex held in spin mode (interlock held, preemption
 * disabled) into a normally-held mutex, releasing the interlock and
 * re-enabling preemption.  Must be called by the owner; a no-op when
 * the lock is already held as a full mutex.
 */
__attribute__((noinline))
void
lck_mtx_convert_spin(
	lck_mtx_t       *lock)
{
	uint32_t state;

	state = ordered_load_mtx_state(lock);

	assertf(lock->lck_mtx_owner == current_thread()->ctid,
	    "lock %p not owned by thread ctid %d (current owner %d)",
	    lock, current_thread()->ctid, lock->lck_mtx_owner);

	if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
		/* already owned as a mutex, just return */
		return;
	}

	/* spin-mode invariants: preemption off, interlock and spin bit set */
	assert(get_preemption_level() > 0);
	assert(state & LCK_MTX_ILOCKED_MSK);
	assert(state & LCK_MTX_SPIN_MSK);

	/*
	 * Check if there are waiters to
	 * inherit their priority.
	 */
	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
		return lck_mtx_convert_spin_acquire_tail(lock);
	}

	lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));

	return;
}
1864
1865 static inline boolean_t
lck_mtx_lock_grab_mutex(lck_mtx_t * lock)1866 lck_mtx_lock_grab_mutex(
1867 lck_mtx_t *lock)
1868 {
1869 uint32_t state;
1870
1871 state = ordered_load_mtx_state(lock);
1872
1873 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
1874 return FALSE;
1875 }
1876
1877 /* lock and interlock acquired */
1878
1879 /* record owner of mutex */
1880 ordered_store_mtx_owner(lock, current_thread()->ctid);
1881
1882 #if MACH_LDEBUG
1883 if (thread) {
1884 thread->mutex_count++; /* lock statistic */
1885 }
1886 #endif
1887 return TRUE;
1888 }
1889
1890 __attribute__((noinline))
1891 void
lck_mtx_assert(lck_mtx_t * lock,unsigned int type)1892 lck_mtx_assert(
1893 lck_mtx_t *lock,
1894 unsigned int type)
1895 {
1896 thread_t thread;
1897 uint32_t state;
1898
1899 thread = current_thread();
1900 state = ordered_load_mtx_state(lock);
1901
1902 if (type == LCK_MTX_ASSERT_OWNED) {
1903 if ((lock->lck_mtx_owner != thread->ctid) ||
1904 !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
1905 panic("mutex (%p) not owned", lock);
1906 }
1907 } else {
1908 assert(type == LCK_MTX_ASSERT_NOTOWNED);
1909 if (lock->lck_mtx_owner == thread->ctid) {
1910 panic("mutex (%p) owned", lock);
1911 }
1912 }
1913 }
1914
/*
 * Routine:	lck_mtx_lock_spinwait_x86
 *
 * Invoked trying to acquire a mutex when there is contention but
 * the holder is running on another processor. We spin for up to a maximum
 * time waiting for the lock to be released.
 *
 * Called with the interlock unlocked.
 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
 * returns LCK_MTX_SPINWAIT_SPUN if we spun
 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
 */
__attribute__((noinline))
lck_mtx_spinwait_ret_type_t
lck_mtx_lock_spinwait_x86(
	lck_mtx_t       *mutex)
{
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
	ctid_t          owner, prev_owner;
	uint64_t        window_deadline, sliding_deadline, high_deadline;
	uint64_t        spin_start, start_time, cur_time, avg_hold_time, bias, delta;
	lck_mtx_spinwait_ret_type_t             retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
	int             loopcount = 0;
	int             total_hold_time_samples, window_hold_time_samples, unfairness;
	uint            prev_owner_cpu;
	bool            adjust;

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe(mutex->lck_mtx_owner)),
	    mutex->lck_mtx_waiters, 0, 0);

	spin_start = LCK_MTX_ADAPTIVE_SPIN_BEGIN();
	start_time = mach_absolute_time();

	/*
	 * window_deadline represents the "learning" phase.
	 * The thread collects statistics about the lock during
	 * window_deadline and then it makes a decision on whether to spin more
	 * or block according to the concurrency behavior
	 * observed.
	 *
	 * Every thread can spin at least low_MutexSpin.
	 */
	window_deadline = start_time + low_MutexSpin;
	/*
	 * Sliding_deadline is the adjusted spin deadline
	 * computed after the "learning" phase.
	 */
	sliding_deadline = window_deadline;
	/*
	 * High_deadline is a hard deadline. No thread
	 * can spin more than this deadline.
	 */
	if (high_MutexSpin >= 0) {
		high_deadline = start_time + high_MutexSpin;
	} else {
		/* negative high_MutexSpin means: scale the floor by cpu count */
		high_deadline = start_time + low_MutexSpin * real_ncpus;
	}

	/*
	 * Do not know yet which is the owner cpu.
	 * Initialize prev_owner_cpu with next cpu.
	 */
	prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
	total_hold_time_samples = 0;
	window_hold_time_samples = 0;
	avg_hold_time = 0;
	adjust = TRUE;
	/* per-lock, per-cpu jitter used to de-synchronize spinners */
	bias = (os_hash_kernel_pointer(mutex) + cpu_number()) % real_ncpus;

	prev_owner = mutex->lck_mtx_owner;

	/*
	 * Spin while:
	 *   - mutex is locked, and
	 *   - it's locked as a spin lock, and
	 *   - owner is running on another processor, and
	 *   - we haven't spun for long enough.
	 */
	do {
		/*
		 * Try to acquire the lock.
		 */
		if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
			retval = LCK_MTX_SPINWAIT_ACQUIRED;
			break;
		}

		cur_time = mach_absolute_time();

		/*
		 * Never spin past high_deadline.
		 */
		if (cur_time >= high_deadline) {
			retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
			break;
		}

		/*
		 * Check if owner is on core. If not block.
		 */
		owner = mutex->lck_mtx_owner;
		if (owner) {
			uint32_t i = prev_owner_cpu;
			bool owner_on_core = false;

			disable_preemption();
			owner = mutex->lck_mtx_owner;

			/*
			 * For scalability we want to check if the owner is on core
			 * without locking the mutex interlock.
			 * If we do not lock the mutex interlock, the owner that we see might be
			 * invalid, so we cannot dereference it. Therefore we cannot check
			 * any field of the thread to tell us if it is on core.
			 * Check if the thread that is running on the other cpus matches the owner.
			 */
			if (owner) {
				thread_t owner_thread = ctid_get_thread_unsafe(owner);

				/* scan starting from where the owner was last seen */
				do {
					if ((cpu_data_ptr[i] != NULL) && (cpu_data_ptr[i]->cpu_active_thread == owner_thread)) {
						owner_on_core = true;
						break;
					}
					if (++i >= real_ncpus) {
						i = 0;
					}
				} while (i != prev_owner_cpu);
			}

			enable_preemption();

			if (owner == 0) {
				/* nothing to do, we didn't find an owner */
			} else if (owner_on_core) {
				prev_owner_cpu = i;
			} else {
				prev_owner = owner;
				owner = mutex->lck_mtx_owner;
				if (owner == prev_owner) {
					/*
					 * Owner is not on core.
					 * Stop spinning.
					 */
					if (loopcount == 0) {
						retval = LCK_MTX_SPINWAIT_NO_SPIN;
					} else {
						retval = LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE;
					}
					break;
				}
				/*
				 * Fall through if the owner changed while we were scanning.
				 * The new owner could potentially be on core, so loop
				 * again.
				 */
			}
		}

		/*
		 * Save how many times we see the owner changing.
		 * We can roughly estimate the mutex hold
		 * time and the fairness with that.
		 */
		if (owner != prev_owner) {
			prev_owner = owner;
			total_hold_time_samples++;
			window_hold_time_samples++;
		}

		/*
		 * Learning window expired.
		 * Try to adjust the sliding_deadline.
		 */
		if (cur_time >= window_deadline) {
			/*
			 * If there was not contention during the window
			 * stop spinning.
			 */
			if (window_hold_time_samples < 1) {
				retval = LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION;
				break;
			}

			if (adjust) {
				/*
				 * For a fair lock, we'd wait for at most (NCPU-1) periods,
				 * but the lock is unfair, so let's try to estimate by how much.
				 */
				unfairness = total_hold_time_samples / real_ncpus;

				if (unfairness == 0) {
					/*
					 * We observed the owner changing `total_hold_time_samples` times which
					 * let us estimate the average hold time of this mutex for the duration
					 * of the spin time.
					 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
					 *
					 * In this case spin at max avg_hold_time * (real_ncpus - 1)
					 */
					delta = cur_time - start_time;
					sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
				} else {
					/*
					 * In this case at least one of the other cpus was able to get the lock twice
					 * while I was spinning.
					 * We could spin longer but it won't necessarily help if the system is unfair.
					 * Try to randomize the wait to reduce contention.
					 *
					 * We compute how much time we could potentially spin
					 * and distribute it over the cpus.
					 *
					 * bias is an integer between 0 and real_ncpus.
					 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
					 */
					delta = high_deadline - cur_time;
					sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
					adjust = FALSE;
				}
			}

			window_deadline += low_MutexSpin;
			window_hold_time_samples = 0;
		}

		/*
		 * Stop spinning if we are past
		 * the adjusted deadline.
		 */
		if (cur_time >= sliding_deadline) {
			retval = LCK_MTX_SPINWAIT_SPUN_SLIDING_THR;
			break;
		}

		if (mutex->lck_mtx_owner != 0) {
			cpu_pause();
		}

		loopcount++;
	} while (TRUE);

	LCK_MTX_ADAPTIVE_SPIN_END(mutex, mutex->lck_mtx_grp, spin_start);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe(mutex->lck_mtx_owner)),
	    mutex->lck_mtx_waiters, retval, 0);

	return retval;
}
2165
2166
2167
2168 /*
2169 * Routine: lck_mtx_lock_wait_x86
2170 *
2171 * Invoked in order to wait on contention.
2172 *
2173 * Called with the interlock locked and
2174 * preemption disabled...
2175 * returns it unlocked and with preemption enabled
2176 *
2177 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
2178 * A runnable waiter can exist between wait and acquire
2179 * without a waiters count being set.
2180 * This allows us to never make a spurious wakeup call.
2181 *
2182 * Priority:
2183 * This avoids taking the thread lock if the owning thread is the same priority.
2184 * This optimizes the case of same-priority threads contending on a lock.
2185 * However, that allows the owning thread to drop in priority while holding the lock,
2186 * because there is no state that the priority change can notice that
2187 * says that the targeted thread holds a contended mutex.
2188 *
2189 * One possible solution: priority changes could look for some atomic tag
2190 * on the thread saying 'holding contended lock', and then set up a promotion.
2191 * Needs a story for dropping that promotion - the last contended unlock
2192 * has to notice that this has happened.
2193 */
__attribute__((noinline))
void
lck_mtx_lock_wait_x86(
	lck_mtx_t               *mutex,
	struct turnstile        **ts)
{
	thread_t        self = current_thread();
	uint64_t        sleep_start;

	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);

	/* Snapshot the current owner; used below as the turnstile inheritor. */
	thread_t holder = ctid_get_thread(mutex->lck_mtx_owner);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(holder),
	    mutex->lck_mtx_waiters, 0, 0);
	sleep_start = LCK_MTX_BLOCK_BEGIN();

	/*
	 * Protected by the interlock, which the caller holds on entry
	 * (see routine header comment).
	 */
	mutex->lck_mtx_waiters++;

	/*
	 * lck_mtx_lock_wait_x86 might be called in a loop. Call prepare just once and reuse
	 * the same turnstile while looping; the matching turnstile complete will be called
	 * by lck_mtx_lock_contended when finally acquiring the lock.
	 */
	if (*ts == NULL) {
		*ts = turnstile_prepare_hash((uintptr_t)mutex, TURNSTILE_KERNEL_MUTEX);
	}

	struct turnstile *turnstile = *ts;
	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
	/* Push our priority onto the owner via the turnstile (update is deferred). */
	turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));

	/* Queue ourselves on the turnstile waitq before releasing the interlock. */
	waitq_assert_wait64(&turnstile->ts_waitq, LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);

	lck_mtx_ilk_unlock(mutex);

	/* Complete the deferred inheritor update now that the interlock is dropped. */
	turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);

	/* Block until woken by an unlock; preemption is re-enabled across the block. */
	thread_block(THREAD_CONTINUE_NULL);

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
	    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(ctid_get_thread_unsafe(mutex->lck_mtx_owner)),
	    mutex->lck_mtx_waiters, 0, 0);

	LCK_MTX_BLOCK_END(mutex, mutex->lck_mtx_grp, sleep_start);
}
2241
2242 /*
2243 * Routine: kdp_lck_mtx_lock_spin_is_acquired
2244 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2245 * Returns: TRUE if lock is acquired.
2246 */
2247 boolean_t
kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t * lck)2248 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
2249 {
2250 if (not_in_kdp) {
2251 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
2252 }
2253
2254 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
2255 return TRUE;
2256 }
2257
2258 return FALSE;
2259 }
2260
2261 void
kdp_lck_mtx_find_owner(__unused struct waitq * waitq,event64_t event,thread_waitinfo_t * waitinfo)2262 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2263 {
2264 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
2265 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2266 waitinfo->owner = thread_tid(ctid_get_thread(mutex->lck_mtx_owner));
2267 }
2268
2269 #endif /* LCK_MTX_USE_ARCH */
2270 #if CONFIG_PV_TICKET
2271 __startup_func
2272 void
lck_init_pv(void)2273 lck_init_pv(void)
2274 {
2275 uint32_t pvtck = 1;
2276 PE_parse_boot_argn("pvticket", &pvtck, sizeof(pvtck));
2277 if (pvtck == 0) {
2278 return;
2279 }
2280 has_lock_pv = cpuid_vmm_present() &&
2281 (cpuid_vmm_get_kvm_features() & CPUID_KVM_FEATURE_PV_UNHALT) != 0;
2282 }
2283 STARTUP(LOCKS, STARTUP_RANK_FIRST, lck_init_pv);
2284 #endif
2285