/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
#define LOCK_PRIVATE 1
#include <debug.h>
#include <kern/locks_internal.h>
#include <kern/lock_stat.h>
#include <kern/locks.h>
#include <kern/zalloc.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>
#include <machine/atomic.h>
#include <machine/machine_cpu.h>

KALLOC_TYPE_DEFINE(KT_LCK_RW, lck_rw_t, KT_PRIV_ACCT);

#define LCK_RW_WRITER_EVENT(lck)                (event_t)((uintptr_t)(lck)+1)
#define LCK_RW_READER_EVENT(lck)                (event_t)((uintptr_t)(lck)+2)
#define WRITE_EVENT_TO_RWLOCK(event)            ((lck_rw_t *)((uintptr_t)(event)-1))
#define READ_EVENT_TO_RWLOCK(event)             ((lck_rw_t *)((uintptr_t)(event)-2))

#if CONFIG_DTRACE
#define DTRACE_RW_SHARED        0x0     //reader
#define DTRACE_RW_EXCL          0x1     //writer
#define DTRACE_NO_FLAG          0x0     //not applicable
#endif  /* CONFIG_DTRACE */

#define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
#define LCK_RW_LCK_SHARED_CODE          0x102
#define LCK_RW_LCK_SH_TO_EX_CODE        0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
#define LCK_RW_LCK_EX_TO_SH_CODE        0x105

#if __x86_64__
#define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
#endif

#define lck_rw_ilk_lock(lock)   hw_lock_bit  ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
#define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)

#define ordered_load_rw(lock)                   os_atomic_load(&(lock)->lck_rw_data, compiler_acq_rel)
#define ordered_store_rw(lock, value)           os_atomic_store(&(lock)->lck_rw_data, (value), compiler_acq_rel)
#define ordered_store_rw_owner(lock, value)     os_atomic_store(&(lock)->lck_rw_owner, (value), compiler_acq_rel)

#ifdef DEBUG_RW
static TUNABLE(bool, lck_rw_recursive_shared_assert_74048094, "lck_rw_recursive_shared_assert", false);
SECURITY_READ_ONLY_EARLY(vm_packing_params_t) rwlde_caller_packing_params =
    VM_PACKING_PARAMS(LCK_RW_CALLER_PACKED);
#define rw_lock_debug_disabled()                (lck_opts_get() & LCK_OPTION_DISABLE_RW_DEBUG)

#define set_rwlde_caller_packed(entry, caller)          ((entry)->rwlde_caller_packed = VM_PACK_POINTER((vm_offset_t)caller, LCK_RW_CALLER_PACKED))
#define get_rwlde_caller(entry)                         ((void*)VM_UNPACK_POINTER(entry->rwlde_caller_packed, LCK_RW_CALLER_PACKED))

#endif /* DEBUG_RW */

/*!
 * @function lck_rw_alloc_init
 *
 * @abstract
 * Allocates and initializes a lck_rw_t.
 *
 * @discussion
 * The function can block. See lck_rw_init() for initialization details.
 *
 * @param grp           lock group to associate with the lock.
 * @param attr          lock attribute to initialize the lock.
 *
 * @returns             NULL or the allocated lock
 */
lck_rw_t *
lck_rw_alloc_init(
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	lck_rw_t *lck;

	lck = zalloc_flags(KT_LCK_RW, Z_WAITOK | Z_ZERO);
	lck_rw_init(lck, grp, attr);
	return lck;
}

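/*
 * Usage sketch (illustrative; `my_grp` is a hypothetical, previously
 * allocated lock group):
 *
 *	lck_rw_t *lck = lck_rw_alloc_init(my_grp, LCK_ATTR_NULL);
 *	// ... use the lock ...
 *	lck_rw_free(lck, my_grp);
 */
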
/*!
 * @function lck_rw_init
 *
 * @abstract
 * Initializes a lck_rw_t.
 *
 * @discussion
 * Usage statistics for the lock will be added to the lock group provided.
 *
 * The lock attribute can be used to specify the lock contention behaviour.
 * RW_WRITER_PRIORITY is the default behaviour (LCK_ATTR_NULL defaults to RW_WRITER_PRIORITY)
 * and lck_attr_rw_shared_priority() can be used to set the behaviour to RW_SHARED_PRIORITY.
 *
 * RW_WRITER_PRIORITY gives priority to the writers upon contention with the readers;
 * if the lock is held and a writer starts waiting for the lock, readers will not be able
 * to acquire the lock until all writers stop contending. Readers could
 * potentially starve.
 * RW_SHARED_PRIORITY gives priority to the readers upon contention with the writers:
 * unless the lock is held in exclusive mode, readers will always be able to acquire the lock.
 * Readers can lock a shared lock even if there are writers waiting. Writers could potentially
 * starve.
 *
 * @param lck           lock to initialize.
 * @param grp           lock group to associate with the lock.
 * @param attr          lock attribute to initialize the lock.
 *
 */
void
lck_rw_init(
	lck_rw_t        *lck,
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	/* keep this so that the lck_type_t type is referenced for lldb */
	lck_type_t type = LCK_TYPE_RW;

	if (attr == LCK_ATTR_NULL) {
		attr = &lck_attr_default;
	}
	*lck = (lck_rw_t){
		.lck_rw_type = type,
		.lck_rw_can_sleep = true,
		.lck_rw_priv_excl = !(attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY),
	};
	lck_grp_reference(grp, &grp->lck_grp_rwcnt);
}

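/*
 * Usage sketch (illustrative): initializing an embedded lock with reader
 * priority; `my_grp` and `obj` are hypothetical.
 *
 *	lck_attr_t *attr = lck_attr_alloc_init();
 *	lck_attr_rw_shared_priority(attr);      // opt into RW_SHARED_PRIORITY
 *	lck_rw_init(&obj->lock, my_grp, attr);
 *	lck_attr_free(attr);
 */
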
/*!
 * @function lck_rw_free
 *
 * @abstract
 * Frees a rw_lock previously allocated with lck_rw_alloc_init().
 *
 * @discussion
 * The lock must not be held by any thread.
 *
 * @param lck           rw_lock to free.
 * @param grp           lock group the lock was allocated with.
 */
void
lck_rw_free(
	lck_rw_t        *lck,
	lck_grp_t       *grp)
{
	lck_rw_destroy(lck, grp);
	zfree(KT_LCK_RW, lck);
}

/*!
 * @function lck_rw_destroy
 *
 * @abstract
 * Destroys a rw_lock previously initialized with lck_rw_init().
 *
 * @discussion
 * The lock must not be held by any thread.
 *
 * @param lck           rw_lock to destroy.
 * @param grp           lock group the lock was initialized with.
 */
void
lck_rw_destroy(
	lck_rw_t        *lck,
	lck_grp_t       *grp)
{
	if (lck->lck_rw_type != LCK_TYPE_RW ||
	    lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
		panic("Destroying previously destroyed lock %p", lck);
	}
	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);

	lck->lck_rw_type = LCK_TYPE_NONE;
	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
	lck_grp_deallocate(grp, &grp->lck_grp_rwcnt);
}

#ifdef DEBUG_RW

/*
 * Best effort mechanism to debug rw_locks.
 *
 * This mechanism is in addition to the owner checks. The owner is set
 * only when the lock is held in exclusive mode, so those checks do not cover
 * the cases in which the lock is held in shared mode.
 *
 * This mechanism tentatively stores the rw_lock acquired and its debug
 * information in the thread struct.
 * Up to LCK_RW_EXPECTED_MAX_NUMBER entries of rw_lock debug information can be stored.
 *
 * NOTE: LCK_RW_EXPECTED_MAX_NUMBER is the expected number of rw_locks held
 * at the same time. If a thread holds more than this number of rw_locks we
 * will start losing debug information.
 * Increasing LCK_RW_EXPECTED_MAX_NUMBER increases the probability that we will
 * store the debug information, but it requires more memory per thread
 * and longer lock/unlock times.
 *
 * If an empty slot is found for the debug information, we record the lock;
 * otherwise we set the overflow threshold flag.
 *
 * If we reached the overflow threshold we might stop asserting, because we can
 * no longer be sure whether the lock was acquired.
 *
 * Even if we reached the overflow threshold, we try to store the debug information
 * for newly acquired locks. This can be useful in core dumps to debug
 * a possible return to userspace without unlocking and to find possible readers
 * holding the lock.
 */
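/*
 * Worked example (illustrative): a thread that takes lock A shared twice
 * (recursive shared holds are tolerated, see assert_canlock_rwlock_slow())
 * and lock B exclusive ends up with rwld_locks_acquired == 3 and
 * rwld_locks_saved == 2, with entries
 * { rwlde_lock = A, rwlde_mode_count = 2 } and
 * { rwlde_lock = B, rwlde_mode_count = -1 }: positive counts track shared
 * holds, -1 marks an exclusive hold. Unlocking reverses the bookkeeping,
 * and once rwld_locks_acquired drops back to 0 the rwld_overflow flag is
 * cleared as well.
 */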
__startup_func
static void
rw_lock_init(void)
{
	if (kern_feature_override(KF_RW_LOCK_DEBUG_OVRD)) {
		LcksOpts |= LCK_OPTION_DISABLE_RW_DEBUG;
	}
}
STARTUP(LOCKS, STARTUP_RANK_FIRST, rw_lock_init);

static inline struct rw_lock_debug_entry *
find_lock_in_savedlocks(lck_rw_t *lock, rw_lock_debug_t *rw_locks_held)
{
	int i;
	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
		struct rw_lock_debug_entry *existing = &rw_locks_held->rwld_locks[i];
		if (existing->rwlde_lock == lock) {
			return existing;
		}
	}

	return NULL;
}

__abortlike
static void
rwlock_slot_panic(rw_lock_debug_t *rw_locks_held)
{
	panic("No empty slot found in %p slot_used %d", rw_locks_held, rw_locks_held->rwld_locks_saved);
}

static inline struct rw_lock_debug_entry *
find_empty_slot(rw_lock_debug_t *rw_locks_held)
{
	int i;
	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
		struct rw_lock_debug_entry *entry = &rw_locks_held->rwld_locks[i];
		if (entry->rwlde_lock == NULL) {
			return entry;
		}
	}
	rwlock_slot_panic(rw_locks_held);
}

__abortlike
static void
canlock_rwlock_panic(lck_rw_t *lock, thread_t thread, struct rw_lock_debug_entry *entry)
{
	panic("RW lock %p already held by %p caller %p mode_count %d state 0x%x owner 0x%p ",
	    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
}

__attribute__((noinline))
static void
assert_canlock_rwlock_slow(lck_rw_t *lock, thread_t thread, lck_rw_type_t type)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
	if (__probable(rw_locks_held->rwld_locks_acquired == 0)) {
		//no locks saved, safe to lock
		return;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__improbable(entry != NULL)) {
		boolean_t can_be_shared_recursive;
		if (lck_rw_recursive_shared_assert_74048094) {
			can_be_shared_recursive = (lock->lck_rw_priv_excl == 0);
		} else {
			/*
			 * lck_rw_lock_shared() is currently called recursively;
			 * until that code is fixed, allow recursive locking
			 * in shared mode.
			 */
			can_be_shared_recursive = TRUE;
		}
		if ((type == LCK_RW_TYPE_SHARED) && can_be_shared_recursive && entry->rwlde_mode_count >= 1) {
			return;
		}
		canlock_rwlock_panic(lock, thread, entry);
	}
}

static inline void
assert_canlock_rwlock(lck_rw_t *lock, thread_t thread, lck_rw_type_t type)
{
	if (__improbable(!rw_lock_debug_disabled())) {
		assert_canlock_rwlock_slow(lock, thread, type);
	}
}

__abortlike
static void
held_rwlock_notheld_panic(lck_rw_t *lock, thread_t thread)
{
	panic("RW lock %p not held by %p", lock, thread);
}

__abortlike
static void
held_rwlock_notheld_with_info_panic(lck_rw_t *lock, thread_t thread, lck_rw_type_t type, struct rw_lock_debug_entry *entry)
{
	if (type == LCK_RW_TYPE_EXCLUSIVE) {
		panic("RW lock %p not held in exclusive by %p caller %p read %d state 0x%x owner 0x%p ",
		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
	} else {
		panic("RW lock %p not held in shared by %p caller %p read %d state 0x%x owner 0x%p ",
		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
	}
}

__attribute__((noinline))
static void
assert_held_rwlock_slow(lck_rw_t *lock, thread_t thread, lck_rw_type_t type)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;

	if (__improbable(rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_locks_saved == 0)) {
		if (rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_overflow == 0) {
			held_rwlock_notheld_panic(lock, thread);
		}
		return;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__probable(entry != NULL)) {
		if (type == LCK_RW_TYPE_EXCLUSIVE && entry->rwlde_mode_count != -1) {
			held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
		} else {
			if (type == LCK_RW_TYPE_SHARED && entry->rwlde_mode_count <= 0) {
				held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
			}
		}
	} else {
		if (rw_locks_held->rwld_overflow == 0) {
			held_rwlock_notheld_panic(lock, thread);
		}
	}
}

static inline void
assert_held_rwlock(lck_rw_t *lock, thread_t thread, lck_rw_type_t type)
{
	if (__improbable(!rw_lock_debug_disabled())) {
		assert_held_rwlock_slow(lock, thread, type);
	}
}

__attribute__((noinline))
static void
change_held_rwlock_slow(lck_rw_t *lock, thread_t thread, lck_rw_type_t typeFrom, void *caller)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
	if (__improbable(rw_locks_held->rwld_locks_saved == 0)) {
		if (rw_locks_held->rwld_overflow == 0) {
			held_rwlock_notheld_panic(lock, thread);
		}
		return;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__probable(entry != NULL)) {
		if (typeFrom == LCK_RW_TYPE_SHARED) {
			//We are upgrading
			assertf(entry->rwlde_mode_count == 1,
			    "RW lock %p not held by a single shared when upgrading "
			    "by %p caller %p read %d state 0x%x owner 0x%p ",
			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
			entry->rwlde_mode_count = -1;
			set_rwlde_caller_packed(entry, caller);
		} else {
			//We are downgrading
			assertf(entry->rwlde_mode_count == -1,
			    "RW lock %p not held in write mode when downgrading "
			    "by %p caller %p read %d state 0x%x owner 0x%p ",
			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
			entry->rwlde_mode_count = 1;
			set_rwlde_caller_packed(entry, caller);
		}
		return;
	}

	if (rw_locks_held->rwld_overflow == 0) {
		held_rwlock_notheld_panic(lock, thread);
	}

	if (rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER) {
		//array is full
		return;
	}

	struct rw_lock_debug_entry *null_entry = find_empty_slot(rw_locks_held);
	null_entry->rwlde_lock = lock;
	set_rwlde_caller_packed(null_entry, caller);
	if (typeFrom == LCK_RW_TYPE_SHARED) {
		null_entry->rwlde_mode_count = -1;
	} else {
		null_entry->rwlde_mode_count = 1;
	}
	rw_locks_held->rwld_locks_saved++;
}

static inline void
change_held_rwlock(lck_rw_t *lock, thread_t thread, lck_rw_type_t typeFrom, void *caller)
{
	if (__improbable(!rw_lock_debug_disabled())) {
		change_held_rwlock_slow(lock, thread, typeFrom, caller);
	}
}

__abortlike
static void
add_held_rwlock_too_many_panic(thread_t thread)
{
	panic("RW lock too many rw locks held, rwld_locks_acquired maxed out for thread %p", thread);
}

static __attribute__((noinline)) void
add_held_rwlock_slow(lck_rw_t *lock, thread_t thread, lck_rw_type_t type, void *caller)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
	struct rw_lock_debug_entry *null_entry;
	if (__improbable(rw_locks_held->rwld_locks_acquired == UINT32_MAX)) {
		add_held_rwlock_too_many_panic(thread);
	}
	rw_locks_held->rwld_locks_acquired++;

	if (type == LCK_RW_TYPE_EXCLUSIVE) {
		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
			//array is full
			rw_locks_held->rwld_overflow = 1;
			return;
		}
		null_entry = find_empty_slot(rw_locks_held);
		null_entry->rwlde_lock = lock;
		set_rwlde_caller_packed(null_entry, caller);
		null_entry->rwlde_mode_count = -1;
		rw_locks_held->rwld_locks_saved++;
		return;
	} else {
		if (__probable(rw_locks_held->rwld_locks_saved == 0)) {
			//array is empty
			goto add_shared;
		}

		boolean_t allow_shared_recursive;
		if (lck_rw_recursive_shared_assert_74048094) {
			allow_shared_recursive = (lock->lck_rw_priv_excl == 0);
		} else {
			allow_shared_recursive = TRUE;
		}
		if (allow_shared_recursive) {
			//It could be already locked in shared mode
			struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
			if (entry != NULL) {
				assert(entry->rwlde_mode_count > 0);
				assertf(entry->rwlde_mode_count != INT8_MAX,
				    "RW lock %p with too many recursive shared held "
				    "from %p caller %p read %d state 0x%x owner 0x%p",
				    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
				    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
				entry->rwlde_mode_count += 1;
				return;
			}
		}

		//none of the locks were a match
		//try to add a new entry
		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
			//array is full
			rw_locks_held->rwld_overflow = 1;
			return;
		}

add_shared:
		null_entry = find_empty_slot(rw_locks_held);
		null_entry->rwlde_lock = lock;
		set_rwlde_caller_packed(null_entry, caller);
		null_entry->rwlde_mode_count = 1;
		rw_locks_held->rwld_locks_saved++;
	}
}

static inline void
add_held_rwlock(lck_rw_t *lock, thread_t thread, lck_rw_type_t type, void *caller)
{
	if (__improbable(!rw_lock_debug_disabled())) {
		add_held_rwlock_slow(lock, thread, type, caller);
	}
}

static void
remove_held_rwlock_slow(lck_rw_t *lock, thread_t thread, lck_rw_type_t type)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
	if (__improbable(rw_locks_held->rwld_locks_acquired == 0)) {
		return;
	}
	rw_locks_held->rwld_locks_acquired--;

	if (rw_locks_held->rwld_locks_saved == 0) {
		assert(rw_locks_held->rwld_overflow == 1);
		goto out;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__probable(entry != NULL)) {
		if (type == LCK_RW_TYPE_EXCLUSIVE) {
			assert(entry->rwlde_mode_count == -1);
			entry->rwlde_mode_count = 0;
		} else {
			assert(entry->rwlde_mode_count > 0);
			entry->rwlde_mode_count--;
			if (entry->rwlde_mode_count > 0) {
				goto out;
			}
		}
		entry->rwlde_caller_packed = 0;
		entry->rwlde_lock = NULL;
		rw_locks_held->rwld_locks_saved--;
	} else {
		assert(rw_locks_held->rwld_overflow == 1);
	}

out:
	if (rw_locks_held->rwld_locks_acquired == 0) {
		rw_locks_held->rwld_overflow = 0;
	}
	return;
}

static inline void
remove_held_rwlock(lck_rw_t *lock, thread_t thread, lck_rw_type_t type)
{
	if (__improbable(!rw_lock_debug_disabled())) {
		remove_held_rwlock_slow(lock, thread, type);
	}
}
#endif /* DEBUG_RW */

/*
 * We disable interrupts while holding the RW interlock to prevent an
 * interrupt from exacerbating hold time.
 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 */
static inline boolean_t
lck_interlock_lock(
	lck_rw_t        *lck)
{
	boolean_t       istate;

	istate = ml_set_interrupts_enabled(FALSE);
	lck_rw_ilk_lock(lck);
	return istate;
}

static inline void
lck_interlock_unlock(
	lck_rw_t        *lck,
	boolean_t       istate)
{
	lck_rw_ilk_unlock(lck);
	ml_set_interrupts_enabled(istate);
}

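/*
 * A minimal sketch of the pattern the slow paths below use (see
 * lck_rw_lock_exclusive_gen() for a real instance):
 *
 *	istate = lck_interlock_lock(lck);       // disables interrupts
 *	word.data = ordered_load_rw(lck);       // inspect state under the interlock
 *	...                                     // update word.data and store it back
 *	lck_interlock_unlock(lck, istate);      // restores the interrupt state
 */
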
/*
 * compute the deadline to spin against when
 * waiting for a change of state on a lck_rw_t
 */
static inline uint64_t
lck_rw_deadline_for_spin(
	lck_rw_t        *lck)
{
	lck_rw_word_t   word;

	word.data = ordered_load_rw(lck);
	if (word.can_sleep) {
		if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
			/*
			 * there are already threads waiting on this lock... this
			 * implies that they have spun beyond their deadlines waiting for
			 * the desired state to show up so we will not bother spinning at this time...
			 *   or
			 * the current number of threads sharing this lock exceeds our capacity to run them
			 * concurrently and since all states we're going to spin for require the rw_shared_count
			 * to be at 0, we'll not bother spinning since the latency for this to happen is
			 * unpredictable...
			 */
			return mach_absolute_time();
		}
		return mach_absolute_time() + os_atomic_load(&MutexSpin, relaxed);
	} else {
		return mach_absolute_time() + (100000LL * 1000000000LL);
	}
}

/*
 * This inline is used when busy-waiting for an rw lock.
 * If interrupts were disabled when the lock primitive was called,
 * we poll the IPI handler for pending TLB flushes on x86.
 */
static inline void
lck_rw_lock_pause(
	boolean_t       interrupts_enabled)
{
#if X86_64
	if (!interrupts_enabled) {
		handle_pending_TLB_flushes();
	}
	cpu_pause();
#else
	(void) interrupts_enabled;
	wait_for_event();
#endif
}

typedef enum __enum_closed {
	LCK_RW_DRAIN_S_DRAINED       = 0,
	LCK_RW_DRAIN_S_NOT_DRAINED   = 1,
	LCK_RW_DRAIN_S_EARLY_RETURN  = 2,
	LCK_RW_DRAIN_S_TIMED_OUT     = 3,
} lck_rw_drain_state_t;

static lck_rw_drain_state_t
lck_rw_drain_status(
	lck_rw_t        *lock,
	uint32_t        status_mask,
	boolean_t       wait,
	bool            (^lock_pause)(void))
{
	uint64_t        deadline = 0;
	uint32_t        data;
	boolean_t       istate = FALSE;

	if (wait) {
		deadline = lck_rw_deadline_for_spin(lock);
#if __x86_64__
		istate = ml_get_interrupts_enabled();
#endif
	}

	for (;;) {
#if __x86_64__
		data = os_atomic_load(&lock->lck_rw_data, relaxed);
#else
		data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
#endif
		if ((data & status_mask) == 0) {
			atomic_exchange_abort();
			return LCK_RW_DRAIN_S_DRAINED;
		}

		if (!wait) {
			atomic_exchange_abort();
			return LCK_RW_DRAIN_S_NOT_DRAINED;
		}

		lck_rw_lock_pause(istate);

		if (mach_absolute_time() >= deadline) {
			return LCK_RW_DRAIN_S_TIMED_OUT;
		}

		if (lock_pause && lock_pause()) {
			return LCK_RW_DRAIN_S_EARLY_RETURN;
		}
	}
}

/*
 * Spin while interlock is held.
 */
static inline void
lck_rw_interlock_spin(
	lck_rw_t        *lock)
{
	uint32_t        data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_relaxed);
		if (data & LCK_RW_INTERLOCK) {
#if __x86_64__
			cpu_pause();
#else
			wait_for_event();
#endif
		} else {
			atomic_exchange_abort();
			return;
		}
	}
}

#define LCK_RW_GRAB_WANT        0
#define LCK_RW_GRAB_SHARED      1

typedef enum __enum_closed __enum_options {
	LCK_RW_GRAB_F_SHARED    = 0x0,  // Not really a flag obviously but makes call sites more readable.
	LCK_RW_GRAB_F_WANT_EXCL = 0x1,
	LCK_RW_GRAB_F_WAIT      = 0x2,
} lck_rw_grab_flags_t;

typedef enum __enum_closed {
	LCK_RW_GRAB_S_NOT_LOCKED    = 0,
	LCK_RW_GRAB_S_LOCKED        = 1,
	LCK_RW_GRAB_S_EARLY_RETURN  = 2,
	LCK_RW_GRAB_S_TIMED_OUT     = 3,
} lck_rw_grab_state_t;

static lck_rw_grab_state_t
lck_rw_grab(
	lck_rw_t            *lock,
	lck_rw_grab_flags_t flags,
	bool                (^lock_pause)(void))
{
	uint64_t        deadline = 0;
	uint32_t        data, prev;
	boolean_t       do_exch, istate = FALSE;

	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);

	if ((flags & LCK_RW_GRAB_F_WAIT) != 0) {
		deadline = lck_rw_deadline_for_spin(lock);
#if __x86_64__
		istate = ml_get_interrupts_enabled();
#endif
	}

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
		if (data & LCK_RW_INTERLOCK) {
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		do_exch = FALSE;
		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
			if ((data & LCK_RW_WANT_EXCL) == 0) {
				data |= LCK_RW_WANT_EXCL;
				do_exch = TRUE;
			}
		} else {        // LCK_RW_GRAB_F_SHARED
			if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
			    (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
				data += LCK_RW_SHARED_READER;
				do_exch = TRUE;
			}
		}
		if (do_exch) {
			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
				return LCK_RW_GRAB_S_LOCKED;
			}
		} else {
			if ((flags & LCK_RW_GRAB_F_WAIT) == 0) {
				atomic_exchange_abort();
				return LCK_RW_GRAB_S_NOT_LOCKED;
			}

			lck_rw_lock_pause(istate);

			if (mach_absolute_time() >= deadline) {
				return LCK_RW_GRAB_S_TIMED_OUT;
			}
			if (lock_pause && lock_pause()) {
				return LCK_RW_GRAB_S_EARLY_RETURN;
			}
		}
	}
}

/*
 * The inverse of lck_rw_grab - drops either the LCK_RW_WANT_EXCL bit or
 * decrements the reader count. Doesn't deal with waking up waiters - i.e.
 * it should only be called when can_sleep is false.
 */
static void
lck_rw_drop(lck_rw_t *lock, lck_rw_grab_flags_t flags)
{
	uint32_t data, prev;

	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);
	assert(!lock->lck_rw_can_sleep);

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);

		/* Interlock should never be taken when can_sleep is false. */
		assert3u(data & LCK_RW_INTERLOCK, ==, 0);

		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
			data &= ~LCK_RW_WANT_EXCL;
		} else {
			data -= LCK_RW_SHARED_READER;
		}

		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
			break;
		}

		cpu_pause();
	}

	return;
}

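/*
 * Sketch of how lck_rw_grab()/lck_rw_drop() pair up on a non-sleepable
 * lock (illustrative; this mirrors what lck_rw_lock_exclusive_gen() does
 * when a lock_pause block forces an early return):
 *
 *	if (lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL, NULL) == LCK_RW_GRAB_S_LOCKED) {
 *		// ... decided to bail out before taking ownership ...
 *		lck_rw_drop(lock, LCK_RW_GRAB_F_WANT_EXCL);
 *	}
 */
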
static boolean_t
lck_rw_lock_exclusive_gen(
	lck_rw_t        *lock,
	bool            (^lock_pause)(void))
{
	__assert_only thread_t self = current_thread();
	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
	lck_rw_word_t           word;
	int                     slept = 0;
	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
	lck_rw_drain_state_t    drain_state = LCK_RW_DRAIN_S_NOT_DRAINED;
	wait_result_t           res = 0;
	boolean_t               istate;

#if     CONFIG_DTRACE
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
#endif

	assertf(lock->lck_rw_owner != self->ctid,
	    "Lock already held state=0x%x, owner=%p",
	    ordered_load_rw(lock), self);

#ifdef DEBUG_RW
	/*
	 * Best effort attempt to check that this thread
	 * is not already holding the lock (this checks read mode too).
	 */
	assert_canlock_rwlock(lock, self, LCK_RW_TYPE_EXCLUSIVE);
#endif /* DEBUG_RW */

	/*
	 *	Try to acquire the lck_rw_want_excl bit.
	 */
	while (lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL, NULL) != LCK_RW_GRAB_S_LOCKED) {
#if     CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lock->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, 0, 0, 0, 0);

		grab_state = lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT, lock_pause);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, 0, 0, grab_state, 0);

		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively;
		 * check to see if we're allowed to do a thread_block
		 */
		word.data = ordered_load_rw(lock);
		if (word.can_sleep) {
			istate = lck_interlock_lock(lock);
			word.data = ordered_load_rw(lock);

			if (word.want_excl) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				word.w_waiting = 1;
				ordered_store_rw(lock, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lock, istate);
				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				word.want_excl = 1;
				ordered_store_rw(lock, word.data);
				lck_interlock_unlock(lock, istate);
				break;
			}
		}
	}

	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
		assert(lock_pause);
		return FALSE;
	}

	/*
	 * Wait for readers (and upgrades) to finish...
	 */
	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
#if     CONFIG_DTRACE
		/*
		 * Either sleeping or spinning is happening, start
		 * a timing of our delay interval now.  If we set it
		 * to -1 we don't have accurate data so we cannot later
		 * decide to record a dtrace spin or sleep event.
		 */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lock->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE, lock_pause);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, drain_state, 0);

		if (drain_state == LCK_RW_DRAIN_S_DRAINED ||
		    drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o the
		 * readers having drained;
		 * check to see if we're allowed to do a thread_block
		 */
		word.data = ordered_load_rw(lock);
		if (word.can_sleep) {
			istate = lck_interlock_lock(lock);
			word.data = ordered_load_rw(lock);

			if (word.shared_count != 0 || word.want_upgrade) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				word.w_waiting = 1;
				ordered_store_rw(lock, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lock, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				lck_interlock_unlock(lock, istate);
				/*
				 * must own the lock now, since we checked for
				 * readers or upgrade owner behind the interlock
				 * no need for a call to 'lck_rw_drain_status'
				 */
				break;
			}
		}
	}

#if     CONFIG_DTRACE
	/*
	 * Decide what latencies we suffered that are Dtrace events.
	 * If we have set wait_interval, then we either spun or slept.
	 * At least we get out from under the interlock before we record
	 * which is the best we can do here to minimize the impact
	 * of the tracing.
	 * If we have set wait_interval to -1, then dtrace was not enabled when we
	 * started sleeping/spinning so we don't record this event.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
			    mach_absolute_time() - wait_interval, 1);
		} else {
			/*
			 * For the blocking case, we also record if when we blocked
			 * it was held for read or write, and how many readers.
			 * Notice that above we recorded this before we dropped
			 * the interlock so the count is accurate.
			 */
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
#endif /* CONFIG_DTRACE */

	if (drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
		lck_rw_drop(lock, LCK_RW_GRAB_F_WANT_EXCL);
		assert(lock_pause);
		return FALSE;
	}

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
#endif  /* CONFIG_DTRACE */

	return TRUE;
}

static inline void
lck_rw_lock_check_preemption(lck_rw_t *lock __unused)
{
	assertf((get_preemption_level() == 0 && ml_get_interrupts_enabled()) ||
	    startup_phase < STARTUP_SUB_EARLY_BOOT ||
	    current_cpu_datap()->cpu_hibernate ||
	    ml_is_quiescing() ||
	    !not_in_kdp,
	    "%s: attempt to take rwlock %p in non-preemptible or interrupt context: "
	    "preemption level = %d, interruptible = %d", __func__, lock,
	    get_preemption_level(), (int)ml_get_interrupts_enabled());
}

#define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \
	    (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
	    LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
/*!
 * @function lck_rw_lock_exclusive_check_contended
 *
 * @abstract
 * Locks a rw_lock in exclusive mode.
 *
 * @discussion
 * This routine IS EXPERIMENTAL.
 * It's only used for the vm object lock, and use for other subsystems is UNSUPPORTED.
 * Note that the return value is ONLY A HEURISTIC w.r.t. the lock's contention.
 *
 * @param lock           rw_lock to lock.
 *
 * @returns Returns TRUE if the thread spun or blocked while attempting to acquire the lock, FALSE
 *          otherwise.
 */
bool
lck_rw_lock_exclusive_check_contended(
	lck_rw_t        *lock)
{
	thread_t        thread = current_thread();
	bool            contended  = false;

	if (lock->lck_rw_can_sleep) {
		lck_rw_lock_check_preemption(lock);
		lck_rw_lock_count_inc(thread, lock);
	} else if (get_preemption_level() == 0) {
		panic("Taking non-sleepable RW lock with preemption enabled");
	}

	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
#if     CONFIG_DTRACE
		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
#endif  /* CONFIG_DTRACE */
	} else {
		contended = true;
		(void) lck_rw_lock_exclusive_gen(lock, NULL);
	}
	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
	ordered_store_rw_owner(lock, thread->ctid);

#ifdef DEBUG_RW
	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, __builtin_return_address(0));
#endif /* DEBUG_RW */
	return contended;
}

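/*
 * Usage sketch (illustrative; the backoff policy is hypothetical):
 *
 *	if (lck_rw_lock_exclusive_check_contended(lock)) {
 *		// we spun or blocked to get here; a caller can use this
 *		// heuristic to e.g. reduce its batch size next time
 *	}
 *	// ... critical section ...
 *	lck_rw_unlock_exclusive(lock);
 */
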
__attribute__((always_inline))
static boolean_t
lck_rw_lock_exclusive_internal_inline(
	lck_rw_t        *lock,
	void            *caller,
	bool            (^lock_pause)(void))
{
#pragma unused(caller)
	thread_t        thread = current_thread();

	if (lock->lck_rw_can_sleep) {
		lck_rw_lock_check_preemption(lock);
		lck_rw_lock_count_inc(thread, lock);
	} else if (get_preemption_level() == 0) {
		panic("Taking non-sleepable RW lock with preemption enabled");
	}

	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
#if     CONFIG_DTRACE
		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
#endif  /* CONFIG_DTRACE */
	} else if (!lck_rw_lock_exclusive_gen(lock, lock_pause)) {
		/*
		 * lck_rw_lock_exclusive_gen() should only return
		 * early if lock_pause has been passed and
		 * returns FALSE. lock_pause is exclusive with
		 * lck_rw_can_sleep().
		 */
		assert(!lock->lck_rw_can_sleep);
		return FALSE;
	}

	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
	ordered_store_rw_owner(lock, thread->ctid);

#if DEBUG_RW
	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
#endif /* DEBUG_RW */

	return TRUE;
}

__attribute__((noinline))
static void
lck_rw_lock_exclusive_internal(
	lck_rw_t        *lock,
	void            *caller)
{
	(void) lck_rw_lock_exclusive_internal_inline(lock, caller, NULL);
}

/*!
 * @function lck_rw_lock_exclusive
 *
 * @abstract
 * Locks a rw_lock in exclusive mode.
 *
 * @discussion
 * This function can block.
 * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
 * can acquire it in exclusive mode.
 * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
 *
 * @param lock           rw_lock to lock.
 */
void
lck_rw_lock_exclusive(
	lck_rw_t        *lock)
{
	(void) lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), NULL);
}

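/*
 * Usage sketch (illustrative):
 *
 *	lck_rw_lock_exclusive(lock);
 *	// ... read and modify the protected state ...
 *	lck_rw_unlock_exclusive(lock);
 */
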
/*!
 * @function lck_rw_lock_exclusive_b
 *
 * @abstract
 * Locks a rw_lock in exclusive mode. Returns early if the lock can't be acquired
 * and the specified block returns true.
 *
 * @discussion
 * Identical to lck_rw_lock_exclusive() but can return early if the lock can't be
 * acquired and the specified block returns true. The block is called
 * repeatedly while waiting to acquire the lock.
 * Should only be called when the lock cannot sleep (i.e. when
 * lock->lck_rw_can_sleep is false).
 *
 * @param lock           rw_lock to lock.
 * @param lock_pause     block invoked while waiting to acquire lock
 *
 * @returns              Returns TRUE if the lock is successfully taken,
 *                       FALSE if the block returns true and the lock has
 *                       not been acquired.
 */
boolean_t
lck_rw_lock_exclusive_b(
	lck_rw_t        *lock,
	bool            (^lock_pause)(void))
{
	assert(!lock->lck_rw_can_sleep);

	return lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), lock_pause);
}

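/*
 * Usage sketch (illustrative; `abort_requested` is a hypothetical flag):
 *
 *	boolean_t locked = lck_rw_lock_exclusive_b(lock, ^bool (void) {
 *	        return os_atomic_load(&abort_requested, relaxed);
 *	});
 *	if (locked) {
 *	        // ... critical section ...
 *	        lck_rw_unlock_exclusive(lock);
 *	}
 */
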
/*
 *	Routine:	lck_rw_lock_shared_gen
 *	Function:
 *		Fast path code has determined that this lock
 *		is held exclusively... this is where we spin/block
 *		until we can acquire the lock in the shared mode
 */
static boolean_t
lck_rw_lock_shared_gen(
	lck_rw_t        *lck,
	bool            (^lock_pause)(void))
{
	__assert_only thread_t  self = current_thread();
	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
	lck_rw_word_t           word;
	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
	int                     slept = 0;
	wait_result_t           res = 0;
	boolean_t               istate;

#if     CONFIG_DTRACE
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
#endif /* CONFIG_DTRACE */

	assertf(lck->lck_rw_owner != self->ctid,
	    "Lock already held state=0x%x, owner=%p",
	    ordered_load_rw(lck), self);

#ifdef DEBUG_RW
	/*
	 * Best effort attempt to check that this thread
	 * is not already holding the lock in shared mode.
	 */
	assert_canlock_rwlock(lck, self, LCK_RW_TYPE_SHARED);
#endif

	while (lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED, NULL) != LCK_RW_GRAB_S_LOCKED) {
#if     CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);

		grab_state = lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED | LCK_RW_GRAB_F_WAIT, lock_pause);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, grab_state, 0);

		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
			break;
		}

		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock for read;
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			word.data = ordered_load_rw(lck);
			if ((word.want_excl || word.want_upgrade) &&
			    ((word.shared_count == 0) || word.priv_excl)) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
				    trace_lck, word.want_excl, word.want_upgrade, 0, 0);

				word.r_waiting = 1;
				ordered_store_rw(lck, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
				res = assert_wait(LCK_RW_READER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
				    trace_lck, res, slept, 0, 0);
			} else {
				word.shared_count++;
				ordered_store_rw(lck, word.data);
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}

#if     CONFIG_DTRACE
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 0,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
#endif /* CONFIG_DTRACE */

	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
		assert(lock_pause);
		return FALSE;
	}

#if     CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
#endif  /* CONFIG_DTRACE */

	return TRUE;
}

__attribute__((always_inline))
static boolean_t
lck_rw_lock_shared_internal_inline(
	lck_rw_t        *lock,
	void            *caller,
	bool            (^lock_pause)(void))
{
#pragma unused(caller)

	uint32_t        data, prev;
	thread_t        thread = current_thread();
#ifdef DEBUG_RW
	boolean_t       check_canlock = TRUE;
#endif

	if (lock->lck_rw_can_sleep) {
		lck_rw_lock_check_preemption(lock);
		lck_rw_lock_count_inc(thread, lock);
	} else if (get_preemption_level() == 0) {
		panic("Taking non-sleepable RW lock with preemption enabled");
	}

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
			atomic_exchange_abort();
			if (!lck_rw_lock_shared_gen(lock, lock_pause)) {
				/*
				 * lck_rw_lock_shared_gen() should only return
				 * early if lock_pause has been passed and
				 * returns FALSE. lock_pause is exclusive with
				 * lck_rw_can_sleep().
				 */
				assert(!lock->lck_rw_can_sleep);
				return FALSE;
			}

			goto locked;
		}
#ifdef DEBUG_RW
		if ((data & LCK_RW_SHARED_MASK) == 0) {
			/*
			 * If the lock is uncontended,
			 * we do not need to check if we can lock it
			 */
			check_canlock = FALSE;
		}
#endif
		data += LCK_RW_SHARED_READER;
		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
			break;
		}
		cpu_pause();
	}
#ifdef DEBUG_RW
	if (check_canlock) {
		/*
		 * Best effort attempt to check that this thread
		 * is not already holding the lock (this checks read mode too).
		 */
		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
	}
#endif
locked:
	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));

#if     CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
#endif  /* CONFIG_DTRACE */

#ifdef DEBUG_RW
	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
#endif /* DEBUG_RW */

	return TRUE;
}

__attribute__((noinline))
static void
lck_rw_lock_shared_internal(
	lck_rw_t        *lock,
	void            *caller)
{
	(void) lck_rw_lock_shared_internal_inline(lock, caller, NULL);
}

/*!
 * @function lck_rw_lock_shared
 *
 * @abstract
 * Locks a rw_lock in shared mode.
 *
 * @discussion
 * This function can block.
 * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
 * can acquire it in exclusive mode.
 * If the lock is held in shared mode and there are no writers waiting, a reader will be able to acquire
 * the lock without waiting.
 * If the lock is held in shared mode and there is at least one writer waiting, a reader will wait
 * for all the writers to make progress if the lock was initialized with the default settings. If instead
 * RW_SHARED_PRIORITY was selected at initialization time, a reader will never wait if the lock is held
 * in shared mode.
 * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
 *
 * @param lock           rw_lock to lock.
 */
void
lck_rw_lock_shared(
	lck_rw_t        *lock)
{
	(void) lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), NULL);
}

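/*
 * Usage sketch (illustrative): any number of readers may run this
 * concurrently as long as no thread holds the lock exclusively.
 *
 *	lck_rw_lock_shared(lock);
 *	// ... read, but do not modify, the protected state ...
 *	lck_rw_unlock_shared(lock);
 */
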
/*!
 * @function lck_rw_lock_shared_b
 *
 * @abstract
 * Locks a rw_lock in shared mode. Returns early if the lock can't be acquired
 * and the specified block returns true.
 *
 * @discussion
 * Identical to lck_rw_lock_shared() but can return early if the lock can't be
 * acquired and the specified block returns true. The block is called
 * repeatedly while waiting to acquire the lock.
 * Should only be called when the lock cannot sleep (i.e. when
 * lock->lck_rw_can_sleep is false).
 *
 * @param lock           rw_lock to lock.
 * @param lock_pause     block invoked while waiting to acquire lock
 *
 * @returns              Returns TRUE if the lock is successfully taken,
 *                       FALSE if the block returns true and the lock has
 *                       not been acquired.
 */
boolean_t
lck_rw_lock_shared_b(
	lck_rw_t        *lock,
	bool            (^lock_pause)(void))
{
	assert(!lock->lck_rw_can_sleep);

	return lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), lock_pause);
}

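/*
 * Usage sketch (illustrative; `max_spin_abs` is a hypothetical bound in
 * mach_absolute_time() units):
 *
 *	uint64_t deadline = mach_absolute_time() + max_spin_abs;
 *	boolean_t locked = lck_rw_lock_shared_b(lock, ^bool (void) {
 *	        return mach_absolute_time() > deadline;
 *	});
 *	if (!locked) {
 *	        // ... fall back without the lock ...
 *	}
 */
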
/*
 *	Routine:	lck_rw_lock_shared_to_exclusive_failure
 *	Function:
 *		Fast path code has already dropped our read
 *		count and determined that someone else owns 'lck_rw_want_upgrade'
 *		if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting';
 *		all we need to do here is determine if a wakeup is needed
 */
static boolean_t
lck_rw_lock_shared_to_exclusive_failure(
	lck_rw_t        *lck,
	uint32_t        prior_lock_state)
{
	thread_t        thread = current_thread();

	if ((prior_lock_state & LCK_RW_W_WAITING) &&
	    ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
		/*
		 *	Someone else has requested upgrade.
		 *	Since we've released the read lock, wake
		 *	them up if they're blocked waiting
		 */
		thread_wakeup(LCK_RW_WRITER_EVENT(lck));
	}

	/* Check if dropping the lock means that we need to unpromote */
	if (lck->lck_rw_can_sleep) {
		lck_rw_lock_count_dec(thread, lck);
	}

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);

#ifdef DEBUG_RW
	remove_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
#endif /* DEBUG_RW */

	return FALSE;
}

1582 /*
1583  *	Routine:	lck_rw_lock_shared_to_exclusive_success
1584  *	Function:
1585  *		the fast path code has already dropped our read
1586  *		count and successfully acquired 'lck_rw_want_upgrade'
1587  *		we just need to wait for the rest of the readers to drain
1588  *		and then we can return as the exclusive holder of this lock
1589  */
1590 static void
1591 lck_rw_lock_shared_to_exclusive_success(
1592 	lck_rw_t        *lock)
1593 {
1594 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1595 	int                     slept = 0;
1596 	lck_rw_word_t           word;
1597 	wait_result_t           res;
1598 	boolean_t               istate;
1599 	lck_rw_drain_state_t    drain_state;
1600 
1601 #if     CONFIG_DTRACE
1602 	uint64_t                wait_interval = 0;
1603 	int                     readers_at_sleep = 0;
1604 	boolean_t               dtrace_ls_initialized = FALSE;
1605 	boolean_t               dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1606 #endif
1607 
1608 	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
1609 		word.data = ordered_load_rw(lock);
1610 #if     CONFIG_DTRACE
1611 		if (dtrace_ls_initialized == FALSE) {
1612 			dtrace_ls_initialized = TRUE;
1613 			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1614 			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1615 			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1616 			if (dtrace_ls_enabled) {
1617 				/*
1618 				 * Either sleeping or spinning is happening,
1619 				 *  start timing our delay interval now.
1620 				 */
1621 				readers_at_sleep = word.shared_count;
1622 				wait_interval = mach_absolute_time();
1623 			}
1624 		}
1625 #endif
1626 
1627 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1628 		    trace_lck, word.shared_count, 0, 0, 0);
1629 
1630 		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE, NULL);
1631 
1632 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1633 		    trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
1634 
1635 		if (drain_state == LCK_RW_DRAIN_S_DRAINED) {
1636 			break;
1637 		}
1638 
1639 		/*
1640 		 * if we get here, the spin deadline in lck_rw_wait_on_status()
1641 		 * has expired w/o the rw_shared_count having drained to 0;
1642 		 * check to see if we're allowed to do a thread_block
1643 		 */
1644 		if (word.can_sleep) {
1645 			istate = lck_interlock_lock(lock);
1646 
1647 			word.data = ordered_load_rw(lock);
1648 			if (word.shared_count != 0) {
1649 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1650 				    trace_lck, word.shared_count, 0, 0, 0);
1651 
1652 				word.w_waiting = 1;
1653 				ordered_store_rw(lock, word.data);
1654 
1655 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1656 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1657 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1658 				lck_interlock_unlock(lock, istate);
1659 
1660 				if (res == THREAD_WAITING) {
1661 					res = thread_block(THREAD_CONTINUE_NULL);
1662 					slept++;
1663 				}
1664 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1665 				    trace_lck, res, slept, 0, 0);
1666 			} else {
1667 				lck_interlock_unlock(lock, istate);
1668 				break;
1669 			}
1670 		}
1671 	}
1672 #if     CONFIG_DTRACE
1673 	/*
1674 	 * We infer whether we took the sleep/spin path above by checking dtrace_ls_enabled and slept.
1675 	 */
1676 	if (dtrace_ls_enabled == TRUE) {
1677 		if (slept == 0) {
1678 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
1679 		} else {
1680 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
1681 			    mach_absolute_time() - wait_interval, 1,
1682 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1683 		}
1684 	}
1685 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1686 #endif
1687 }
1688 
1689 /*!
1690  * @function lck_rw_lock_shared_to_exclusive
1691  *
1692  * @abstract
1693  * Upgrades a rw_lock held in shared mode to exclusive.
1694  *
1695  * @discussion
1696  * This function can block.
1697  * Only one reader at a time can upgrade to exclusive mode. If the upgrade fails, the function
1698  * returns with the lock not held.
1699  * The caller needs to hold the lock in shared mode to upgrade it.
1700  *
1701  * @param lock           rw_lock already held in shared mode to upgrade.
1702  *
1703  * @returns TRUE if the lock was upgraded, FALSE if it was not possible.
1704  *          If the function was not able to upgrade the lock, the lock will be dropped
1705  *          by the function.
1706  */
1707 boolean_t
1708 lck_rw_lock_shared_to_exclusive(
1709 	lck_rw_t        *lock)
1710 {
1711 	thread_t thread = current_thread();
1712 	uint32_t data, prev;
1713 
1714 	assertf(lock->lck_rw_priv_excl != 0, "lock %p thread %p", lock, current_thread());
1715 
1716 #if DEBUG_RW
1717 	assert_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1718 #endif /* DEBUG_RW */
1719 
1720 	for (;;) {
1721 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1722 		if (data & LCK_RW_INTERLOCK) {
1723 			atomic_exchange_abort();
1724 			lck_rw_interlock_spin(lock);
1725 			continue;
1726 		}
1727 		if (data & LCK_RW_WANT_UPGRADE) {
1728 			data -= LCK_RW_SHARED_READER;
1729 			if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
1730 				data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
1731 			}
1732 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1733 				return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1734 			}
1735 		} else {
1736 			data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
1737 			data -= LCK_RW_SHARED_READER;           /* and shed our read count */
1738 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1739 				break;
1740 			}
1741 		}
1742 		cpu_pause();
1743 	}
1744 	/* we now own the WANT_UPGRADE */
1745 	if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
1746 		lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
1747 	}
1748 
1749 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1750 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1751 
1752 	ordered_store_rw_owner(lock, thread->ctid);
1753 #if     CONFIG_DTRACE
1754 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1755 #endif  /* CONFIG_DTRACE */
1756 
1757 #if DEBUG_RW
1758 	change_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, __builtin_return_address(0));
1759 #endif /* DEBUG_RW */
1760 	return TRUE;
1761 }
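
/*
 * Illustrative sketch of the canonical upgrade pattern: on failure the
 * lock has already been dropped, so it must be re-taken in exclusive mode
 * and any state observed under the shared hold must be re-validated.
 * needs_update()/do_update() are hypothetical helpers.
 *
 *	lck_rw_lock_shared(&obj->lock);
 *	if (needs_update(obj)) {
 *		if (!lck_rw_lock_shared_to_exclusive(&obj->lock)) {
 *			// upgrade failed and the lock was dropped
 *			lck_rw_lock_exclusive(&obj->lock);
 *		}
 *		if (needs_update(obj)) {        // re-check under the exclusive hold
 *			do_update(obj);
 *		}
 *		lck_rw_unlock_exclusive(&obj->lock);
 *	} else {
 *		lck_rw_unlock_shared(&obj->lock);
 *	}
 */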
1762 
1763 /*
1764  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1765  *      Function:
1766  *		Fast path has already dropped
1767  *		our exclusive state and bumped lck_rw_shared_count
1768  *		all we need to do here is determine if anyone
1769  *		needs to be awakened.
1770  */
1771 static void
1772 lck_rw_lock_exclusive_to_shared_gen(
1773 	lck_rw_t        *lck,
1774 	uint32_t        prior_lock_state,
1775 	void            *caller)
1776 {
1777 #pragma unused(caller)
1778 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1779 	lck_rw_word_t   fake_lck;
1780 
1781 	/*
1782 	 * prior_lock_state is a snapshot of the 1st word of the
1783 	 * lock in question... we'll fake up a lock word from it
1784 	 * and carefully not access anything beyond what's defined
1785 	 * in the first word of a lck_rw_t
1786 	 */
1787 	fake_lck.data = prior_lock_state;
1788 
1789 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1790 	    trace_lck, fake_lck.want_excl, fake_lck.want_upgrade, 0, 0);
1791 
1792 	/*
1793 	 * don't wake up anyone waiting to take the lock exclusively
1794 	 * since we hold a read count... when the read count drops to 0,
1795 	 * the writers will be woken.
1796 	 *
1797 	 * wake up any waiting readers if we don't have any writers waiting,
1798 	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1799 	 */
1800 	if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1801 		thread_wakeup(LCK_RW_READER_EVENT(lck));
1802 	}
1803 
1804 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1805 	    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1806 
1807 #if CONFIG_DTRACE
1808 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1809 #endif
1810 
1811 #if DEBUG_RW
1812 	thread_t        thread = current_thread();
1813 	change_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1814 #endif /* DEBUG_RW */
1815 }
1816 
1817 /*!
1818  * @function lck_rw_lock_exclusive_to_shared
1819  *
1820  * @abstract
1821  * Downgrades a rw_lock held in exclusive mode to shared.
1822  *
1823  * @discussion
1824  * The caller needs to hold the lock in exclusive mode to be able to downgrade it.
1825  *
1826  * @param lock           rw_lock already held in exclusive mode to downgrade.
1827  */
1828 void
1829 lck_rw_lock_exclusive_to_shared(
1830 	lck_rw_t        *lock)
1831 {
1832 	uint32_t        data, prev;
1833 
1834 	assertf(lock->lck_rw_owner == current_thread()->ctid,
1835 	    "state=0x%x, owner=%p", lock->lck_rw_data,
1836 	    ctid_get_thread_unsafe(lock->lck_rw_owner));
1837 	ordered_store_rw_owner(lock, 0);
1838 
1839 	for (;;) {
1840 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1841 		if (data & LCK_RW_INTERLOCK) {
1842 			atomic_exchange_abort();
1843 			lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
1844 			continue;
1845 		}
1846 		data += LCK_RW_SHARED_READER;
1847 		if (data & LCK_RW_WANT_UPGRADE) {
1848 			data &= ~(LCK_RW_WANT_UPGRADE);
1849 		} else {
1850 			data &= ~(LCK_RW_WANT_EXCL);
1851 		}
1852 		if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1853 			data &= ~(LCK_RW_W_WAITING);
1854 		}
1855 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1856 			break;
1857 		}
1858 		cpu_pause();
1859 	}
1860 	lck_rw_lock_exclusive_to_shared_gen(lock, prev, __builtin_return_address(0));
1861 }
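
/*
 * Illustrative sketch: do the writer's work under the exclusive hold,
 * then downgrade so other readers can proceed while this thread keeps
 * reading. cache_fill()/cache_read() are hypothetical.
 *
 *	lck_rw_lock_exclusive(&cache->lock);
 *	cache_fill(cache);                      // exclusive writer work
 *	lck_rw_lock_exclusive_to_shared(&cache->lock);
 *	cache_read(cache);                      // now just one reader of many
 *	lck_rw_unlock_shared(&cache->lock);
 */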
1862 
1863 /*
1864  * Very sad hack, but the codegen for lck_rw_lock
1865  * is very unhappy with the combination of __builtin_return_address()
1866  * and a noreturn function. For some reason it adds more frames
1867  * than it should. rdar://76570684
1868  */
1869 void
1870 _lck_rw_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
1871 #pragma clang diagnostic push
1872 #pragma clang diagnostic ignored "-Wmissing-noreturn"
1873 __attribute__((noinline, weak))
1874 void
1875 _lck_rw_lock_type_panic(
1876 	lck_rw_t        *lck,
1877 	lck_rw_type_t   lck_rw_type)
1878 {
1879 	panic("lck_rw_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
1880 }
1881 #pragma clang diagnostic pop
1882 
1883 /*!
1884  * @function lck_rw_lock
1885  *
1886  * @abstract
1887  * Locks a rw_lock with the specified type.
1888  *
1889  * @discussion
1890  * See lck_rw_lock_shared() or lck_rw_lock_exclusive() for more details.
1891  *
1892  * @param lck           rw_lock to lock.
1893  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1894  */
1895 void
1896 lck_rw_lock(
1897 	lck_rw_t        *lck,
1898 	lck_rw_type_t   lck_rw_type)
1899 {
1900 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1901 		return lck_rw_lock_shared_internal(lck, __builtin_return_address(0));
1902 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1903 		return lck_rw_lock_exclusive_internal(lck, __builtin_return_address(0));
1904 	}
1905 	_lck_rw_lock_type_panic(lck, lck_rw_type);
1906 }
1907 
1908 __attribute__((always_inline))
1909 static boolean_t
1910 lck_rw_try_lock_shared_internal_inline(
1911 	lck_rw_t        *lock,
1912 	void            *caller)
1913 {
1914 #pragma unused(caller)
1915 
1916 	uint32_t        data, prev;
1917 	thread_t        thread = current_thread();
1918 #ifdef DEBUG_RW
1919 	boolean_t       check_canlock = TRUE;
1920 #endif
1921 
1922 	for (;;) {
1923 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1924 		if (data & LCK_RW_INTERLOCK) {
1925 			atomic_exchange_abort();
1926 			lck_rw_interlock_spin(lock);
1927 			continue;
1928 		}
1929 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1930 			atomic_exchange_abort();
1931 			return FALSE;             /* lock is busy */
1932 		}
1933 #ifdef DEBUG_RW
1934 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1935 			/*
1936 			 * If the lock is uncontended,
1937 			 * we do not need to check if we can lock it
1938 			 */
1939 			check_canlock = FALSE;
1940 		}
1941 #endif
1942 		data += LCK_RW_SHARED_READER;     /* Increment reader refcount */
1943 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1944 			break;
1945 		}
1946 		cpu_pause();
1947 	}
1948 #ifdef DEBUG_RW
1949 	if (check_canlock) {
1950 		/*
1951 		 * Best effort attempt to check that this thread
1952 		 * is not already holding the lock (this checks read mode too).
1953 		 */
1954 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1955 	}
1956 #endif
1957 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1958 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1959 
1960 	if (lock->lck_rw_can_sleep) {
1961 		lck_rw_lock_count_inc(thread, lock);
1962 	} else if (get_preemption_level() == 0) {
1963 		panic("Taking non-sleepable RW lock with preemption enabled");
1964 	}
1965 
1966 #if     CONFIG_DTRACE
1967 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1968 #endif  /* CONFIG_DTRACE */
1969 
1970 #ifdef DEBUG_RW
1971 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1972 #endif /* DEBUG_RW */
1973 	return TRUE;
1974 }
1975 
1976 __attribute__((noinline))
1977 static boolean_t
1978 lck_rw_try_lock_shared_internal(
1979 	lck_rw_t        *lock,
1980 	void            *caller)
1981 {
1982 	return lck_rw_try_lock_shared_internal_inline(lock, caller);
1983 }
1984 
1985 /*!
1986  * @function lck_rw_try_lock_shared
1987  *
1988  * @abstract
1989  * Tries to lock a rw_lock in shared (read) mode.
1990  *
1991  * @discussion
1992  * This function returns immediately and does not block if the lock cannot be acquired.
1993  * See lck_rw_lock_shared for more details.
1994  *
1995  * @param lock           rw_lock to lock.
1996  *
1997  * @returns TRUE if the lock is successfully acquired, FALSE if it could not be acquired without blocking.
1998  */
1999 boolean_t
2000 lck_rw_try_lock_shared(
2001 	lck_rw_t        *lock)
2002 {
2003 	return lck_rw_try_lock_shared_internal_inline(lock, __builtin_return_address(0));
2004 }
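
/*
 * Illustrative sketch: an opportunistic read that falls back to a slower
 * path rather than blocking when a writer owns or wants the lock.
 * stats_fast_path()/stats_slow_path() are hypothetical.
 *
 *	if (lck_rw_try_lock_shared(&tbl->lock)) {
 *		stats_fast_path(tbl);
 *		lck_rw_unlock_shared(&tbl->lock);
 *	} else {
 *		stats_slow_path(tbl);           // lock busy; do not block
 *	}
 */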
2005 
2006 __attribute__((always_inline))
2007 static boolean_t
2008 lck_rw_try_lock_exclusive_internal_inline(
2009 	lck_rw_t        *lock,
2010 	void            *caller)
2011 {
2012 #pragma unused(caller)
2013 	uint32_t        data, prev;
2014 
2015 	for (;;) {
2016 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
2017 		if (data & LCK_RW_INTERLOCK) {
2018 			atomic_exchange_abort();
2019 			lck_rw_interlock_spin(lock);
2020 			continue;
2021 		}
2022 		if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
2023 			atomic_exchange_abort();
2024 			return FALSE;
2025 		}
2026 		data |= LCK_RW_WANT_EXCL;
2027 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
2028 			break;
2029 		}
2030 		cpu_pause();
2031 	}
2032 	thread_t thread = current_thread();
2033 
2034 	if (lock->lck_rw_can_sleep) {
2035 		lck_rw_lock_count_inc(thread, lock);
2036 	} else if (get_preemption_level() == 0) {
2037 		panic("Taking non-sleepable RW lock with preemption enabled");
2038 	}
2039 
2040 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
2041 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
2042 
2043 	ordered_store_rw_owner(lock, thread->ctid);
2044 #if     CONFIG_DTRACE
2045 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
2046 #endif  /* CONFIG_DTRACE */
2047 
2048 #ifdef DEBUG_RW
2049 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
2050 #endif /* DEBUG_RW */
2051 	return TRUE;
2052 }
2053 
2054 __attribute__((noinline))
2055 static boolean_t
2056 lck_rw_try_lock_exclusive_internal(
2057 	lck_rw_t        *lock,
2058 	void            *caller)
2059 {
2060 	return lck_rw_try_lock_exclusive_internal_inline(lock, caller);
2061 }
2062 
2063 /*!
2064  * @function lck_rw_try_lock_exclusive
2065  *
2066  * @abstract
2067  * Tries to lock a rw_lock in exclusive (write) mode.
2068  *
2069  * @discussion
2070  * This function returns immediately and does not block if the lock is already held.
2071  * See lck_rw_lock_exclusive for more details.
2072  *
2073  * @param lock           rw_lock to lock.
2074  *
2075  * @returns TRUE if the lock is successfully acquired, FALSE if it was already held.
2076  */
2077 boolean_t
2078 lck_rw_try_lock_exclusive(
2079 	lck_rw_t        *lock)
2080 {
2081 	return lck_rw_try_lock_exclusive_internal_inline(lock, __builtin_return_address(0));
2082 }
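
/*
 * Illustrative sketch: attempt exclusive work opportunistically and
 * defer it when the lock is contended. process_pending()/defer_work()
 * are hypothetical.
 *
 *	if (lck_rw_try_lock_exclusive(&q->lock)) {
 *		process_pending(q);
 *		lck_rw_unlock_exclusive(&q->lock);
 *	} else {
 *		defer_work(q);                  // retry from a later context
 *	}
 */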
2083 
2084 /*
2085  * Very sad hack, but the codegen for lck_rw_try_lock
2086  * is very unhappy with the combination of __builtin_return_address()
2087  * and a noreturn function. For some reason it adds more frames
2088  * than it should. rdar://76570684
2089  */
2090 boolean_t
2091 _lck_rw_try_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
2092 #pragma clang diagnostic push
2093 #pragma clang diagnostic ignored "-Wmissing-noreturn"
2094 __attribute__((noinline, weak))
2095 boolean_t
2096 _lck_rw_try_lock_type_panic(
2097 	lck_rw_t        *lck,
2098 	lck_rw_type_t   lck_rw_type)
2099 {
2100 	panic("lck_rw_try_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
2101 }
2102 #pragma clang diagnostic pop
2103 
2104 /*!
2105  * @function lck_rw_try_lock
2106  *
2107  * @abstract
2108  * Tries to lock a rw_lock with the specified type.
2109  *
2110  * @discussion
2111  * This function returns immediately and does not block if the lock cannot be acquired.
2112  * See lck_rw_try_lock_shared() or lck_rw_try_lock_exclusive() for more details.
2113  *
2114  * @param lck           rw_lock to lock.
2115  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2116  *
2117  * @returns TRUE if the lock is successfully acquired, FALSE if it could not be acquired without blocking.
2118  */
2119 boolean_t
2120 lck_rw_try_lock(
2121 	lck_rw_t        *lck,
2122 	lck_rw_type_t   lck_rw_type)
2123 {
2124 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2125 		return lck_rw_try_lock_shared_internal(lck, __builtin_return_address(0));
2126 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2127 		return lck_rw_try_lock_exclusive_internal(lck, __builtin_return_address(0));
2128 	}
2129 	return _lck_rw_try_lock_type_panic(lck, lck_rw_type);
2130 }
2131 
2132 /*
2133  *	Routine:	lck_rw_done_gen
2134  *
2135  *	prior_lock_state is the value in the 1st
2136  *	word of the lock at the time of a successful
2137  *	atomic compare and exchange with the new value...
2138  *	it represents the state of the lock before we
2139  *	decremented the rw_shared_count or cleared either
2140  *	rw_want_upgrade or rw_want_write and
2141  *	the lck_x_waiting bits...  since the wrapper
2142  *	routine has already changed the state atomically,
2143  *	we just need to decide if we should
2144  *	wake up anyone and what value to return... we do
2145  *	this by examining the state of the lock before
2146  *	we changed it
2147  */
2148 static lck_rw_type_t
2149 lck_rw_done_gen(
2150 	lck_rw_t        *lck,
2151 	uint32_t        prior_lock_state)
2152 {
2153 	lck_rw_word_t   fake_lck;
2154 	lck_rw_type_t   lock_type;
2155 	thread_t        thread;
2156 
2157 	/*
2158 	 * prior_lock_state is a snapshot of the 1st word of the
2159 	 * lock in question... we'll fake up a lock word from it
2160 	 * and carefully not access anything beyond what's defined
2161 	 * in the first word of a lck_rw_t
2162 	 */
2163 	fake_lck.data = prior_lock_state;
2164 
2165 	if (fake_lck.shared_count <= 1) {
2166 		if (fake_lck.w_waiting) {
2167 			thread_wakeup(LCK_RW_WRITER_EVENT(lck));
2168 		}
2169 
2170 		if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
2171 			thread_wakeup(LCK_RW_READER_EVENT(lck));
2172 		}
2173 	}
2174 	if (fake_lck.shared_count) {
2175 		lock_type = LCK_RW_TYPE_SHARED;
2176 	} else {
2177 		lock_type = LCK_RW_TYPE_EXCLUSIVE;
2178 	}
2179 
2180 	/* Check if dropping the lock means that we need to unpromote */
2181 	thread = current_thread();
2182 	if (fake_lck.can_sleep) {
2183 		lck_rw_lock_count_dec(thread, lck);
2184 	}
2185 
2186 #if CONFIG_DTRACE
2187 	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
2188 #endif
2189 
2190 #ifdef DEBUG_RW
2191 	remove_held_rwlock(lck, thread, lock_type);
2192 #endif /* DEBUG_RW */
2193 	return lock_type;
2194 }
2195 
2196 /*!
2197  * @function lck_rw_done
2198  *
2199  * @abstract
2200  * Force unlocks a rw_lock without consistency checks.
2201  *
2202  * @discussion
2203  * Do not use unless you are sure you can avoid the consistency checks.
2204  *
2205  * @param lock           rw_lock to unlock.
2206  */
2207 lck_rw_type_t
2208 lck_rw_done(
2209 	lck_rw_t        *lock)
2210 {
2211 	uint32_t        data, prev;
2212 	boolean_t       once = FALSE;
2213 
2214 #ifdef DEBUG_RW
2215 	/*
2216 	 * Best effort attempt to check that this thread
2217 	 * is holding the lock.
2218 	 */
2219 	thread_t thread = current_thread();
2220 	assert_held_rwlock(lock, thread, 0);
2221 #endif /* DEBUG_RW */
2222 	for (;;) {
2223 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
2224 		if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
2225 			atomic_exchange_abort();
2226 			lck_rw_interlock_spin(lock);
2227 			continue;
2228 		}
2229 		if (data & LCK_RW_SHARED_MASK) {        /* lock is held shared */
2230 			assertf(lock->lck_rw_owner == 0,
2231 			    "state=0x%x, owner=%p", lock->lck_rw_data,
2232 			    ctid_get_thread_unsafe(lock->lck_rw_owner));
2233 			data -= LCK_RW_SHARED_READER;
2234 			if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
2235 				goto check_waiters;
2236 			}
2237 		} else {                                        /* if reader count == 0, must be exclusive lock */
2238 			if (data & LCK_RW_WANT_UPGRADE) {
2239 				data &= ~(LCK_RW_WANT_UPGRADE);
2240 			} else {
2241 				if (data & LCK_RW_WANT_EXCL) {
2242 					data &= ~(LCK_RW_WANT_EXCL);
2243 				} else {                                /* lock is not 'owned', panic */
2244 					panic("Releasing non-exclusive RW lock without a reader refcount!");
2245 				}
2246 			}
2247 			if (!once) {
2248 				// Only check for holder and clear it once
2249 				assertf(lock->lck_rw_owner == current_thread()->ctid,
2250 				    "state=0x%x, owner=%p", lock->lck_rw_data,
2251 				    ctid_get_thread_unsafe(lock->lck_rw_owner));
2252 				ordered_store_rw_owner(lock, 0);
2253 				once = TRUE;
2254 			}
2255 check_waiters:
2256 			/*
2257 			 * test the original values to match what
2258 			 * lck_rw_done_gen is going to do to determine
2259 			 * which wakeups need to happen...
2260 			 *
2261 			 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
2262 			 */
2263 			if (prev & LCK_RW_W_WAITING) {
2264 				data &= ~(LCK_RW_W_WAITING);
2265 				if ((prev & LCK_RW_PRIV_EXCL) == 0) {
2266 					data &= ~(LCK_RW_R_WAITING);
2267 				}
2268 			} else {
2269 				data &= ~(LCK_RW_R_WAITING);
2270 			}
2271 		}
2272 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
2273 			break;
2274 		}
2275 		cpu_pause();
2276 	}
2277 	return lck_rw_done_gen(lock, prev);
2278 }
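
/*
 * Illustrative sketch: lck_rw_done() is useful when the hold mode is not
 * known statically, e.g. in common teardown code reached from both reader
 * and writer paths:
 *
 *	lck_rw_type_t mode = lck_rw_done(&obj->lock);
 *	// mode is LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE, matching
 *	// how this thread had been holding the lock
 */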
2279 
2280 /*!
2281  * @function lck_rw_unlock_shared
2282  *
2283  * @abstract
2284  * Unlocks a rw_lock previously locked in shared mode.
2285  *
2286  * @discussion
2287  * The same thread that locked the lock needs to unlock it.
2288  *
2289  * @param lck           rw_lock held in shared mode to unlock.
2290  */
2291 void
2292 lck_rw_unlock_shared(
2293 	lck_rw_t        *lck)
2294 {
2295 	lck_rw_type_t   ret;
2296 
2297 	assertf(lck->lck_rw_owner == 0,
2298 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2299 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2300 	assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
2301 	ret = lck_rw_done(lck);
2302 
2303 	if (ret != LCK_RW_TYPE_SHARED) {
2304 		panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
2305 	}
2306 }
2307 
2308 /*!
2309  * @function lck_rw_unlock_exclusive
2310  *
2311  * @abstract
2312  * Unlocks a rw_lock previously locked in exclusive mode.
2313  *
2314  * @discussion
2315  * The same thread that locked the lock needs to unlock it.
2316  *
2317  * @param lck           rw_lock held in exclusive mode to unlock.
2318  */
2319 void
2320 lck_rw_unlock_exclusive(
2321 	lck_rw_t        *lck)
2322 {
2323 	lck_rw_type_t   ret;
2324 
2325 	assertf(lck->lck_rw_owner == current_thread()->ctid,
2326 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2327 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2328 	ret = lck_rw_done(lck);
2329 
2330 	if (ret != LCK_RW_TYPE_EXCLUSIVE) {
2331 		panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
2332 	}
2333 }
2334 
2335 /*!
2336  * @function lck_rw_unlock
2337  *
2338  * @abstract
2339  * Unlocks a rw_lock previously locked with lck_rw_type.
2340  *
2341  * @discussion
2342  * The lock must be unlocked by the same thread it was locked from.
2343  * The type of the lock/unlock has to match, unless an upgrade/downgrade was performed while
2344  * holding the lock.
2345  *
2346  * @param lck           rw_lock to unlock.
2347  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2348  */
2349 void
2350 lck_rw_unlock(
2351 	lck_rw_t         *lck,
2352 	lck_rw_type_t    lck_rw_type)
2353 {
2354 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2355 		lck_rw_unlock_shared(lck);
2356 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2357 		lck_rw_unlock_exclusive(lck);
2358 	} else {
2359 		panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
2360 	}
2361 }
2362 
2363 /*!
2364  * @function lck_rw_assert
2365  *
2366  * @abstract
2367  * Asserts the rw_lock is held.
2368  *
2369  * @discussion
2370  * read-write locks do not have a concept of ownership when held in shared mode,
2371  * so this function merely asserts that someone is holding the lock, not necessarily the caller.
2372  * However, if rw_lock_debug is on, a best-effort mechanism to track the owners is in place, and
2373  * this function can be more accurate.
2374  * Type can be LCK_RW_ASSERT_SHARED, LCK_RW_ASSERT_EXCLUSIVE, LCK_RW_ASSERT_HELD,
2375  * or LCK_RW_ASSERT_NOTHELD.
2376  *
2377  * @param lck   rw_lock to check.
2378  * @param type  assert type
2379  */
2380 void
2381 lck_rw_assert(
2382 	lck_rw_t        *lck,
2383 	unsigned int    type)
2384 {
2385 	thread_t thread = current_thread();
2386 
2387 	switch (type) {
2388 	case LCK_RW_ASSERT_SHARED:
2389 		if ((lck->lck_rw_shared_count != 0) &&
2390 		    (lck->lck_rw_owner == 0)) {
2391 #if DEBUG_RW
2392 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2393 #endif /* DEBUG_RW */
2394 			return;
2395 		}
2396 		break;
2397 	case LCK_RW_ASSERT_EXCLUSIVE:
2398 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2399 		    (lck->lck_rw_shared_count == 0) &&
2400 		    (lck->lck_rw_owner == thread->ctid)) {
2401 #if DEBUG_RW
2402 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2403 #endif /* DEBUG_RW */
2404 			return;
2405 		}
2406 		break;
2407 	case LCK_RW_ASSERT_HELD:
2408 		if (lck->lck_rw_shared_count != 0) {
2409 #if DEBUG_RW
2410 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2411 #endif /* DEBUG_RW */
2412 			return;         // Held shared
2413 		}
2414 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2415 		    (lck->lck_rw_owner == thread->ctid)) {
2416 #if DEBUG_RW
2417 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2418 #endif /* DEBUG_RW */
2419 			return;         // Held exclusive
2420 		}
2421 		break;
2422 	case LCK_RW_ASSERT_NOTHELD:
2423 		if ((lck->lck_rw_shared_count == 0) &&
2424 		    !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2425 		    (lck->lck_rw_owner == 0)) {
2426 #ifdef DEBUG_RW
2427 			assert_canlock_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2428 #endif /* DEBUG_RW */
2429 			return;
2430 		}
2431 		break;
2432 	default:
2433 		break;
2434 	}
2435 	panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2436 }
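
/*
 * Illustrative sketch: a function that requires its caller to hold the
 * lock exclusively can enforce that contract up front. The obj structure
 * and field are hypothetical.
 *
 *	static void
 *	obj_update_locked(struct obj *o)
 *	{
 *		lck_rw_assert(&o->lock, LCK_RW_ASSERT_EXCLUSIVE);
 *		o->generation++;                // safe: we are the writer
 *	}
 */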
2437 
2438 /*!
2439  * @function kdp_lck_rw_lock_is_acquired_exclusive
2440  *
2441  * @abstract
2442  * Checks if a rw_lock is held exclusively.
2443  *
2444  * @discussion
2445  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2446  *
2447  * @param lck   lock to check
2448  *
2449  * @returns TRUE if the lock is held exclusively
2450  */
2451 boolean_t
2452 kdp_lck_rw_lock_is_acquired_exclusive(
2453 	lck_rw_t        *lck)
2454 {
2455 	if (not_in_kdp) {
2456 		panic("rw lock exclusive check done outside of kernel debugger");
2457 	}
2458 	return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2459 }
2460 
2461 void
2462 kdp_rwlck_find_owner(
2463 	__unused struct waitq   *waitq,
2464 	event64_t               event,
2465 	thread_waitinfo_t       *waitinfo)
2466 {
2467 	lck_rw_t        *rwlck = NULL;
2468 	switch (waitinfo->wait_type) {
2469 	case kThreadWaitKernelRWLockRead:
2470 		rwlck = READ_EVENT_TO_RWLOCK(event);
2471 		break;
2472 	case kThreadWaitKernelRWLockWrite:
2473 	case kThreadWaitKernelRWLockUpgrade:
2474 		rwlck = WRITE_EVENT_TO_RWLOCK(event);
2475 		break;
2476 	default:
2477 		panic("%s was called with an invalid blocking type", __FUNCTION__);
2478 		break;
2479 	}
2480 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2481 	waitinfo->owner = thread_tid(ctid_get_thread(rwlck->lck_rw_owner));
2482 }
2483 
2484 /*!
2485  * @function lck_rw_lock_would_yield_shared
2486  *
2487  * @abstract
2488  * Check whether a rw_lock currently held in shared mode would be yielded
2489  *
2490  * @discussion
2491  * This function can be used when lck_rw_lock_yield_shared() would be
2492  * inappropriate due to the need to perform additional housekeeping
2493  * prior to any yield or when the caller may wish to prematurely terminate
2494  * an operation rather than resume it after regaining the lock.
2495  *
2496  * @param lck           rw_lock already held in shared mode to check.
2497  *
2498  * @returns TRUE if the lock would yield, FALSE otherwise
2499  */
2500 bool
2501 lck_rw_lock_would_yield_shared(
2502 	lck_rw_t        *lck)
2503 {
2504 	lck_rw_word_t   word;
2505 
2506 	lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2507 
2508 	word.data = ordered_load_rw(lck);
2509 	if (word.want_excl || word.want_upgrade) {
2510 		return true;
2511 	}
2512 
2513 	return false;
2514 }
2515 
2516 /*!
2517  * @function lck_rw_lock_yield_shared
2518  *
2519  * @abstract
2520  * Yields a rw_lock held in shared mode.
2521  *
2522  * @discussion
2523  * This function can block.
2524  * Yields the lock in case there are writers waiting.
2525  * The yield will unlock, block, and re-lock the lock in shared mode.
2526  *
2527  * @param lck           rw_lock already held in shared mode to yield.
2528  * @param force_yield   if set to true it will always yield irrespective of the lock status
2529  *
2530  * @returns TRUE if the lock was yielded, FALSE otherwise
2531  */
2532 bool
2533 lck_rw_lock_yield_shared(
2534 	lck_rw_t        *lck,
2535 	boolean_t       force_yield)
2536 {
2537 	if (lck_rw_lock_would_yield_shared(lck) || force_yield) {
2538 		lck_rw_unlock_shared(lck);
2539 		mutex_pause(2);
2540 		lck_rw_lock_shared(lck);
2541 		return true;
2542 	}
2543 
2544 	return false;
2545 }
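
/*
 * Illustrative sketch: a long shared-mode scan that periodically lets
 * waiting writers in. If the yield happened, the lock was dropped and
 * re-taken, so the iteration state must be revalidated (here: restarted).
 * first()/next()/inspect() are hypothetical.
 *
 *	lck_rw_lock_shared(&list->lock);
 *	elem = first(list);
 *	while (elem != NULL) {
 *		if (lck_rw_lock_yield_shared(&list->lock, FALSE)) {
 *			elem = first(list);     // lock was dropped: restart
 *			continue;
 *		}
 *		inspect(elem);
 *		elem = next(elem);
 *	}
 *	lck_rw_unlock_shared(&list->lock);
 */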
2546 
2547 /*!
2548  * @function lck_rw_lock_would_yield_exclusive
2549  *
2550  * @abstract
2551  * Check whether a rw_lock currently held in exclusive mode would be yielded
2552  *
2553  * @discussion
2554  * This function can be used when lck_rw_lock_yield_exclusive would be
2555  * inappropriate due to the need to perform additional housekeeping
2556  * prior to any yield or when the caller may wish to prematurely terminate
2557  * an operation rather than resume it after regaining the lock.
2558  *
2559  * @param lck           rw_lock already held in exclusive mode to check.
2560  * @param mode          when to yield.
2561  *
2562  * @returns TRUE if the lock would yield, FALSE otherwise
2563  */
2564 bool
2565 lck_rw_lock_would_yield_exclusive(
2566 	lck_rw_t        *lck,
2567 	lck_rw_yield_t  mode)
2568 {
2569 	lck_rw_word_t word;
2570 	bool yield = false;
2571 
2572 	lck_rw_assert(lck, LCK_RW_ASSERT_EXCLUSIVE);
2573 
2574 	if (mode == LCK_RW_YIELD_ALWAYS) {
2575 		yield = true;
2576 	} else {
2577 		word.data = ordered_load_rw(lck);
2578 		if (word.w_waiting) {
2579 			yield = true;
2580 		} else if (mode == LCK_RW_YIELD_ANY_WAITER) {
2581 			yield = (word.r_waiting != 0);
2582 		}
2583 	}
2584 
2585 	return yield;
2586 }
2587 
2588 /*!
2589  * @function lck_rw_lock_yield_exclusive
2590  *
2591  * @abstract
2592  * Yields a rw_lock held in exclusive mode.
2593  *
2594  * @discussion
2595  * This function can block.
2596  * Yields the lock in case there are waiters (which waiters qualify depends on the mode).
2597  * The yield will unlock, block, and re-lock the lock in exclusive mode.
2598  *
2599  * @param lck           rw_lock already held in exclusive mode to yield.
2600  * @param mode          when to yield.
2601  *
2602  * @returns TRUE if the lock was yielded, FALSE otherwise
2603  */
2604 bool
2605 lck_rw_lock_yield_exclusive(
2606 	lck_rw_t        *lck,
2607 	lck_rw_yield_t  mode)
2608 {
2609 	bool yield = lck_rw_lock_would_yield_exclusive(lck, mode);
2610 
2611 	if (yield) {
2612 		lck_rw_unlock_exclusive(lck);
2613 		mutex_pause(2);
2614 		lck_rw_lock_exclusive(lck);
2615 	}
2616 
2617 	return yield;
2618 }
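
/*
 * Illustrative sketch: use the would-yield check to flush batched state
 * while still the exclusive owner, then perform the actual yield.
 * flush_batch() is hypothetical.
 *
 *	if (lck_rw_lock_would_yield_exclusive(&q->lock, LCK_RW_YIELD_ANY_WAITER)) {
 *		flush_batch(q);                 // housekeeping while still owner
 *		(void) lck_rw_lock_yield_exclusive(&q->lock, LCK_RW_YIELD_ANY_WAITER);
 *		// lock was dropped and re-taken: cached state is stale
 *	}
 */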
2619 
2620 /*!
2621  * @function lck_rw_sleep
2622  *
2623  * @abstract
2624  * Assert_wait on an event while holding the rw_lock.
2625  *
2626  * @discussion
2627  * The flags decide how to re-acquire the lock upon wake up
2628  * (LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2629  * and whether the priority needs to be kept boosted until the lock is
2630  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2631  *
2632  * @param lck                   rw_lock to use to synch the assert_wait.
2633  * @param lck_sleep_action      flags.
2634  * @param event                 event to assert_wait on.
2635  * @param interruptible         wait type.
2636  */
2637 wait_result_t
2638 lck_rw_sleep(
2639 	lck_rw_t                *lck,
2640 	lck_sleep_action_t      lck_sleep_action,
2641 	event_t                 event,
2642 	wait_interrupt_t        interruptible)
2643 {
2644 	wait_result_t           res;
2645 	lck_rw_type_t           lck_rw_type;
2646 	thread_pri_floor_t      token;
2647 
2648 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2649 		panic("Invalid lock sleep action %x", lck_sleep_action);
2650 	}
2651 
2652 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2653 		/*
2654 		 * Although we are dropping the RW lock, the intent in most cases
2655 		 * is that this thread remains as an observer, since it may hold
2656 		 * some secondary resource, but must yield to avoid deadlock. In
2657 		 * this situation, make sure that the thread is boosted to the
2658 		 * ceiling while blocked, so that it can re-acquire the
2659 		 * RW lock at that priority.
2660 		 */
2661 		token = thread_priority_floor_start();
2662 	}
2663 
2664 	res = assert_wait(event, interruptible);
2665 	if (res == THREAD_WAITING) {
2666 		lck_rw_type = lck_rw_done(lck);
2667 		res = thread_block(THREAD_CONTINUE_NULL);
2668 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2669 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2670 				lck_rw_lock(lck, lck_rw_type);
2671 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2672 				lck_rw_lock_exclusive(lck);
2673 			} else {
2674 				lck_rw_lock_shared(lck);
2675 			}
2676 		}
2677 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2678 		(void)lck_rw_done(lck);
2679 	}
2680 
2681 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2682 		thread_priority_floor_end(&token);
2683 	}
2684 
2685 	return res;
2686 }
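
/*
 * Illustrative sketch: the classic wait-for-condition loop. The lock is
 * dropped while blocked and re-acquired in exclusive mode
 * (LCK_SLEEP_EXCLUSIVE) before the condition is re-checked. "q->ready"
 * is hypothetical; the waker sets it under the lock and calls
 * thread_wakeup() on the same event.
 *
 *	lck_rw_lock_exclusive(&q->lock);
 *	while (!q->ready) {
 *		(void) lck_rw_sleep(&q->lock, LCK_SLEEP_EXCLUSIVE,
 *		    (event_t)&q->ready, THREAD_UNINT);
 *	}
 *	// q->ready observed true while holding the lock exclusively
 *	lck_rw_unlock_exclusive(&q->lock);
 */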
2687 
2688 /*!
2689  * @function lck_rw_sleep_deadline
2690  *
2691  * @abstract
2692  * Assert_wait_deadline on an event while holding the rw_lock.
2693  *
2694  * @discussion
2695  * The flags decide how to re-acquire the lock upon wake up
2696  * (LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2697  * and whether the priority needs to be kept boosted until the lock is
2698  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2699  *
2700  * @param lck                   rw_lock to use to synch the assert_wait.
2701  * @param lck_sleep_action      flags.
2702  * @param event                 event to assert_wait on.
2703  * @param interruptible         wait type.
2704  * @param deadline              absolute time at which the wait times out if the event has not been signaled
2705  */
2706 wait_result_t
2707 lck_rw_sleep_deadline(
2708 	lck_rw_t                *lck,
2709 	lck_sleep_action_t      lck_sleep_action,
2710 	event_t                 event,
2711 	wait_interrupt_t        interruptible,
2712 	uint64_t                deadline)
2713 {
2714 	wait_result_t           res;
2715 	lck_rw_type_t           lck_rw_type;
2716 	thread_pri_floor_t      token;
2717 
2718 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2719 		panic("Invalid lock sleep action %x", lck_sleep_action);
2720 	}
2721 
2722 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2723 		token = thread_priority_floor_start();
2724 	}
2725 
2726 	res = assert_wait_deadline(event, interruptible, deadline);
2727 	if (res == THREAD_WAITING) {
2728 		lck_rw_type = lck_rw_done(lck);
2729 		res = thread_block(THREAD_CONTINUE_NULL);
2730 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2731 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2732 				lck_rw_lock(lck, lck_rw_type);
2733 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2734 				lck_rw_lock_exclusive(lck);
2735 			} else {
2736 				lck_rw_lock_shared(lck);
2737 			}
2738 		}
2739 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2740 		(void)lck_rw_done(lck);
2741 	}
2742 
2743 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2744 		thread_priority_floor_end(&token);
2745 	}
2746 
2747 	return res;
2748 }
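
/*
 * Illustrative sketch: a bounded wait that distinguishes timeout from a
 * genuine wakeup. "q->ready" is hypothetical as above, and the deadline
 * is assumed here to be computed with the clock_interval_to_deadline()
 * helper.
 *
 *	uint64_t deadline;
 *	wait_result_t wr;
 *
 *	clock_interval_to_deadline(1, NSEC_PER_SEC, &deadline);  // ~1s from now
 *	lck_rw_lock_exclusive(&q->lock);
 *	while (!q->ready) {
 *		wr = lck_rw_sleep_deadline(&q->lock, LCK_SLEEP_EXCLUSIVE,
 *		    (event_t)&q->ready, THREAD_UNINT, deadline);
 *		if (wr == THREAD_TIMED_OUT) {
 *			break;                  // gave up waiting
 *		}
 *	}
 *	lck_rw_unlock_exclusive(&q->lock);
 */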
2749 
2750 /*
2751  * Reader-writer lock promotion
2752  *
2753  * We support a limited form of reader-writer
2754  * lock promotion whose effects are:
2755  *
2756  *   * Qualifying threads have decay disabled
2757  *   * Scheduler priority is reset to a floor of
2758  *     their statically assigned priority
2759  *     or MINPRI_RWLOCK
2760  *
2761  * The rationale is that lck_rw_ts do not have
2762  * a single owner, so we cannot apply a directed
2763  * priority boost from all waiting threads
2764  * to all holding threads without maintaining
2765  * lists of all shared owners and all waiting
2766  * threads for every lock.
2767  *
2768  * Instead (and to preserve the uncontended fast-
2769  * path), acquiring (or attempting to acquire)
2770  * a RW lock in shared or exclusive mode increments
2771  * a per-thread counter. Only if that thread stops
2772  * making forward progress (for instance blocking
2773  * on a mutex, or being preempted) do we consult
2774  * the counter and apply the priority floor.
2775  * When the thread becomes runnable again (or in
2776  * the case of preemption it never stopped being
2777  * runnable), it has the priority boost and should
2778  * be in a good position to run on the CPU and
2779  * release all RW locks (at which point the priority
2780  * boost is cleared).
2781  *
2782  * Care must be taken to ensure that priority
2783  * boosts are not retained indefinitely, since unlike
2784  * mutex priority boosts (where the boost is tied
2785  * to the mutex lifecycle), the boost is tied
2786  * to the thread and independent of any particular
2787  * lck_rw_t. Assertions are in place on return
2788  * to userspace so that the boost is not held
2789  * indefinitely.
2790  *
2791  * The routines that increment/decrement the
2792  * per-thread counter should err on the side of
2793  * incrementing any time a preemption is possible
2794  * and the lock would be visible to the rest of the
2795  * system as held (so it should be incremented before
2796  * interlocks are dropped/preemption is enabled, or
2797  * before a CAS is executed to acquire the lock).
2798  *
2799  */
2800 
2801 /*!
2802  * @function lck_rw_clear_promotion
2803  *
2804  * @abstract
2805  * Undo priority promotions when the last rw_lock
2806  * is released by a thread (if a promotion was active).
2807  *
2808  * @param thread        thread to demote.
2809  * @param lock          object reason for the demotion.
2810  */
2811 __attribute__((noinline))
2812 static void
2813 lck_rw_clear_promotion(thread_t thread, const void *lock)
2814 {
2815 	/* Cancel any promotions if the thread had actually blocked while holding a RW lock */
2816 	spl_t s = splsched();
2817 	thread_lock(thread);
2818 
2819 	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
2820 		sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED,
2821 		    unslide_for_kdebug(lock));
2822 	}
2823 
2824 	thread_unlock(thread);
2825 	splx(s);
2826 }
2827 
2828 /*!
2829  * @function lck_rw_set_promotion_locked
2830  *
2831  * @abstract
2832  * Callout from context switch if the thread goes
2833  * off core with a positive rwlock_count.
2834  *
2835  * @discussion
2836  * Called at splsched with the thread locked.
2837  *
2838  * @param thread        thread to promote.
2839  */
2840 __attribute__((always_inline))
2841 void
2842 lck_rw_set_promotion_locked(thread_t thread)
2843 {
2844 	if (LcksOpts & LCK_OPTION_DISABLE_RW_PRIO) {
2845 		return;
2846 	}
2847 
2848 	assert(thread->rwlock_count > 0);
2849 
2850 	if (!(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2851 		sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, 0);
2852 	}
2853 }
2854 
2855 __attribute__((always_inline))
2856 void
2857 lck_rw_lock_count_inc(thread_t thread, const void *lock __unused)
2858 {
2859 	if (thread->rwlock_count++ == 0) {
2860 #if MACH_ASSERT
2861 		/*
2862 		 * Set the ast to check that the
2863 		 * rwlock_count is going to be set to zero when
2864 		 * going back to userspace.
2865 		 * Set it only once when we increment it for the first time.
2866 		 */
2867 		act_set_debug_assert();
2868 #endif
2869 	}
2870 }
2871 
2872 __abortlike
2873 static void
2874 __lck_rw_lock_count_dec_panic(thread_t thread)
2875 {
2876 	panic("rw lock count underflow for thread %p", thread);
2877 }
2878 
2879 __attribute__((always_inline))
2880 void
2881 lck_rw_lock_count_dec(thread_t thread, const void *lock)
2882 {
2883 	uint32_t rwlock_count = thread->rwlock_count--;
2884 
2885 	if (rwlock_count == 0) {
2886 		__lck_rw_lock_count_dec_panic(thread);
2887 	}
2888 
2889 	if (__probable(rwlock_count == 1)) {
2890 		/* sched_flags checked without lock, but will be rechecked while clearing */
2891 		if (__improbable(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2892 			lck_rw_clear_promotion(thread, lock);
2893 		}
2894 	}
2895 }
2896