xref: /xnu-10002.41.9/osfmk/kern/lock_rw.c (revision 699cd48037512bf4380799317ca44ca453c82f57)
1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 #define LOCK_PRIVATE 1
57 #include <debug.h>
58 #include <kern/locks_internal.h>
59 #include <kern/lock_stat.h>
60 #include <kern/locks.h>
61 #include <kern/zalloc.h>
62 #include <kern/thread.h>
63 #include <kern/processor.h>
64 #include <kern/sched_prim.h>
65 #include <kern/debug.h>
66 #include <machine/atomic.h>
67 #include <machine/machine_cpu.h>
68 
69 KALLOC_TYPE_DEFINE(KT_LCK_RW, lck_rw_t, KT_PRIV_ACCT);
70 
71 #define LCK_RW_WRITER_EVENT(lck)                (event_t)((uintptr_t)(lck)+1)
72 #define LCK_RW_READER_EVENT(lck)                (event_t)((uintptr_t)(lck)+2)
73 #define WRITE_EVENT_TO_RWLOCK(event)            ((lck_rw_t *)((uintptr_t)(event)-1))
74 #define READ_EVENT_TO_RWLOCK(event)             ((lck_rw_t *)((uintptr_t)(event)-2))
75 
76 #if CONFIG_DTRACE
77 #define DTRACE_RW_SHARED        0x0     //reader
78 #define DTRACE_RW_EXCL          0x1     //writer
79 #define DTRACE_NO_FLAG          0x0     //not applicable
80 #endif  /* CONFIG_DTRACE */
81 
82 #define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
83 #define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
84 #define LCK_RW_LCK_SHARED_CODE          0x102
85 #define LCK_RW_LCK_SH_TO_EX_CODE        0x103
86 #define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
87 #define LCK_RW_LCK_EX_TO_SH_CODE        0x105
88 
89 #if __x86_64__
90 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
91 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
92 #define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
93 #define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
94 #define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
95 #define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
96 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
97 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
98 #endif
99 
100 #define lck_rw_ilk_lock(lock)   hw_lock_bit  ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
101 #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
102 
103 #define ordered_load_rw(lock)                   os_atomic_load(&(lock)->lck_rw_data, compiler_acq_rel)
104 #define ordered_store_rw(lock, value)           os_atomic_store(&(lock)->lck_rw_data, (value), compiler_acq_rel)
105 #define ordered_store_rw_owner(lock, value)     os_atomic_store(&(lock)->lck_rw_owner, (value), compiler_acq_rel)
106 
107 #ifdef DEBUG_RW
108 static TUNABLE(bool, lck_rw_recursive_shared_assert_74048094, "lck_rw_recursive_shared_assert", false);
109 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) rwlde_caller_packing_params =
110     VM_PACKING_PARAMS(LCK_RW_CALLER_PACKED);
111 #define rw_lock_debug_disabled()                (lck_opts_get() & LCK_OPTION_DISABLE_RW_DEBUG)
112 
113 #define set_rwlde_caller_packed(entry, caller)          ((entry)->rwlde_caller_packed = VM_PACK_POINTER((vm_offset_t)caller, LCK_RW_CALLER_PACKED))
114 #define get_rwlde_caller(entry)                         ((void*)VM_UNPACK_POINTER(entry->rwlde_caller_packed, LCK_RW_CALLER_PACKED))
115 
116 #endif /* DEBUG_RW */
117 
118 /*!
119  * @function lck_rw_alloc_init
120  *
121  * @abstract
122  * Allocates and initializes a rw_lock_t.
123  *
124  * @discussion
125  * The function can block. See lck_rw_init() for initialization details.
126  *
127  * @param grp           lock group to associate with the lock.
128  * @param attr          lock attribute to initialize the lock.
129  *
130  * @returns             NULL or the allocated lock
131  */
132 lck_rw_t *
133 lck_rw_alloc_init(
134 	lck_grp_t       *grp,
135 	lck_attr_t      *attr)
136 {
137 	lck_rw_t *lck;
138 
139 	lck = zalloc_flags(KT_LCK_RW, Z_WAITOK | Z_ZERO);
140 	lck_rw_init(lck, grp, attr);
141 	return lck;
142 }
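
/*
 * Illustrative sketch of the allocate/free lifecycle described above.
 * Not taken from this file; "my_grp" is a hypothetical, already
 * initialized lock group:
 *
 *	lck_rw_t *lock = lck_rw_alloc_init(my_grp, LCK_ATTR_NULL);
 *	... use the lock ...
 *	lck_rw_free(lock, my_grp);
 */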
143 
144 /*!
145  * @function lck_rw_init
146  *
147  * @abstract
148  * Initializes a rw_lock_t.
149  *
150  * @discussion
151  * Usage statistics for the lock are going to be added to the lock group provided.
152  *
153  * The lock attribute can be used to specify the lock contention behaviour.
154  * RW_WRITER_PRIORITY is the default behaviour (LCK_ATTR_NULL defaults to RW_WRITER_PRIORITY)
155  * and lck_attr_rw_shared_priority() can be used to set the behaviour to RW_SHARED_PRIORITY.
156  *
157  * RW_WRITER_PRIORITY gives priority to the writers upon contention with the readers;
158  * if the lock is held and a writer starts waiting for the lock, readers will not be able
159  * to acquire the lock until all writers stop contending. Readers could
160  * potentially starve.
161  * RW_SHARED_PRIORITY gives priority to the readers upon contention with the writers:
162  * unless the lock is held in exclusive mode, readers will always be able to acquire the lock.
163  * Readers can lock a shared lock even if there are writers waiting. Writers could potentially
164  * starve.
165  *
166  * @param lck           lock to initialize.
167  * @param grp           lock group to associate with the lock.
168  * @param attr          lock attribute to initialize the lock.
169  *
170  */
171 void
172 lck_rw_init(
173 	lck_rw_t        *lck,
174 	lck_grp_t       *grp,
175 	lck_attr_t      *attr)
176 {
177 	/* keep this so that the lck_type_t type is referenced for lldb */
178 	lck_type_t type = LCK_TYPE_RW;
179 
180 	if (attr == LCK_ATTR_NULL) {
181 		attr = &lck_attr_default;
182 	}
183 	*lck = (lck_rw_t){
184 		.lck_rw_type = type,
185 		.lck_rw_can_sleep = true,
186 		.lck_rw_priv_excl = !(attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY),
187 	};
188 	lck_grp_reference(grp, &grp->lck_grp_rwcnt);
189 }
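
/*
 * Illustrative sketch of initializing a reader-priority lock as described
 * above. "my_lock" and "my_grp" are hypothetical; this assumes the
 * standard lck_attr_* helpers:
 *
 *	lck_attr_t attr;
 *
 *	lck_attr_setdefault(&attr);
 *	lck_attr_rw_shared_priority(&attr);
 *	lck_rw_init(&my_lock, my_grp, &attr);
 */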
190 
191 /*!
192  * @function lck_rw_free
193  *
194  * @abstract
195  * Frees a rw_lock previously allocated with lck_rw_alloc_init().
196  *
197  * @discussion
198  * The lock must not be held by any thread.
199  *
200  * @param lck           rw_lock to free.
201  */
202 void
203 lck_rw_free(
204 	lck_rw_t        *lck,
205 	lck_grp_t       *grp)
206 {
207 	lck_rw_destroy(lck, grp);
208 	zfree(KT_LCK_RW, lck);
209 }
210 
211 /*!
212  * @function lck_rw_destroy
213  *
214  * @abstract
215  * Destroys a rw_lock previously initialized with lck_rw_init().
216  *
217  * @discussion
218  * The lock must not be held by any thread.
219  *
220  * @param lck           rw_lock to destroy.
221  */
222 void
223 lck_rw_destroy(
224 	lck_rw_t        *lck,
225 	lck_grp_t       *grp)
226 {
227 	if (lck->lck_rw_type != LCK_TYPE_RW ||
228 	    lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
229 		panic("Destroying previously destroyed lock %p", lck);
230 	}
231 	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
232 
233 	lck->lck_rw_type = LCK_TYPE_NONE;
234 	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
235 	lck_grp_deallocate(grp, &grp->lck_grp_rwcnt);
236 }
237 
238 #ifdef DEBUG_RW
239 
240 /*
241  * Best effort mechanism to debug rw_locks.
242  *
243  * This mechanism is in addition to the owner checks. The owner is set
244  * only when the lock is held in exclusive mode so the checks do not cover
245  * the cases in which the lock is held in shared mode.
246  *
247  * This mechanism tentatively stores the rw_lock acquired and its debug
248  * information on the thread struct.
249  * At most LCK_RW_EXPECTED_MAX_NUMBER rw lock debug entries can be stored.
250  *
251  * NOTE: LCK_RW_EXPECTED_MAX_NUMBER is the expected number of rw_locks held
252  * at the same time. If a thread holds more than this number of rw_locks we
253  * will start losing debug information.
254  * Increasing LCK_RW_EXPECTED_MAX_NUMBER will increase the probability we will
255  * store the debug information but it will require more memory per thread
256  * and longer lock/unlock time.
257  *
258  * If an empty slot is found for the debug information, we record the lock;
259  * otherwise we set the overflow flag.
260  *
261  * Once we have reached the overflow threshold we might stop asserting, because
262  * we can no longer be sure whether the lock was acquired or not.
263  *
264  * Even if we reached the overflow threshold, we try to store the debug information
265  * for the new locks acquired. This can be useful in core dumps to debug
266  * possible return to userspace without unlocking and to find possible readers
267  * holding the lock.
268  */
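
/*
 * For reference, a minimal sketch of the per-thread bookkeeping described
 * above, limited to the fields this file touches (the authoritative
 * definitions live in the thread/locks headers and may differ in detail):
 *
 *	struct rw_lock_debug_entry {
 *		lck_rw_t    *rwlde_lock;            // lock tracked by this slot
 *		vm_offset_t  rwlde_caller_packed;   // packed caller address
 *		int8_t       rwlde_mode_count;      // -1 exclusive, >0 shared depth
 *	};
 *	typedef struct rw_lock_debug {
 *		struct rw_lock_debug_entry rwld_locks[LCK_RW_EXPECTED_MAX_NUMBER];
 *		uint32_t rwld_locks_acquired;       // rw locks currently held
 *		uint8_t  rwld_locks_saved;          // slots currently in use
 *		uint8_t  rwld_overflow;             // set once the slots run out
 *	} rw_lock_debug_t;
 */
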
269 __startup_func
270 static void
271 rw_lock_init(void)
272 {
273 	if (kern_feature_override(KF_RW_LOCK_DEBUG_OVRD)) {
274 		LcksOpts |= LCK_OPTION_DISABLE_RW_DEBUG;
275 	}
276 }
277 STARTUP(LOCKS, STARTUP_RANK_FIRST, rw_lock_init);
278 
279 static inline struct rw_lock_debug_entry *
280 find_lock_in_savedlocks(lck_rw_t* lock, rw_lock_debug_t *rw_locks_held)
281 {
282 	int i;
283 	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
284 		struct rw_lock_debug_entry *existing = &rw_locks_held->rwld_locks[i];
285 		if (existing->rwlde_lock == lock) {
286 			return existing;
287 		}
288 	}
289 
290 	return NULL;
291 }
292 
293 __abortlike
294 static void
295 rwlock_slot_panic(rw_lock_debug_t *rw_locks_held)
296 {
297 	panic("No empty slot found in %p slot_used %d", rw_locks_held, rw_locks_held->rwld_locks_saved);
298 }
299 
300 static inline struct rw_lock_debug_entry *
301 find_empty_slot(rw_lock_debug_t *rw_locks_held)
302 {
303 	int i;
304 	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
305 		struct rw_lock_debug_entry *entry = &rw_locks_held->rwld_locks[i];
306 		if (entry->rwlde_lock == NULL) {
307 			return entry;
308 		}
309 	}
310 	rwlock_slot_panic(rw_locks_held);
311 }
312 
313 __abortlike
314 static void
315 canlock_rwlock_panic(lck_rw_t* lock, thread_t thread, struct rw_lock_debug_entry *entry)
316 {
317 	panic("RW lock %p already held by %p caller %p mode_count %d state 0x%x owner 0x%p ",
318 	    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
319 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
320 }
321 
322 __attribute__((noinline))
323 static void
324 assert_canlock_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
325 {
326 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
327 	if (__probable(rw_locks_held->rwld_locks_acquired == 0)) {
328 		//no locks saved, safe to lock
329 		return;
330 	}
331 
332 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
333 	if (__improbable(entry != NULL)) {
334 		boolean_t can_be_shared_recursive;
335 		if (lck_rw_recursive_shared_assert_74048094) {
336 			can_be_shared_recursive = (lock->lck_rw_priv_excl == 0);
337 		} else {
338 			/* rw_lock_shared is currently called recursively;
339 			 * until that code is fixed, allow locking
340 			 * recursively in shared mode
341 			 */
342 			can_be_shared_recursive = TRUE;
343 		}
344 		if ((type == LCK_RW_TYPE_SHARED) && can_be_shared_recursive && entry->rwlde_mode_count >= 1) {
345 			return;
346 		}
347 		canlock_rwlock_panic(lock, thread, entry);
348 	}
349 }
350 
351 static inline void
352 assert_canlock_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
353 {
354 	if (__improbable(!rw_lock_debug_disabled())) {
355 		assert_canlock_rwlock_slow(lock, thread, type);
356 	}
357 }
358 
359 __abortlike
360 static void
361 held_rwlock_notheld_panic(lck_rw_t* lock, thread_t thread)
362 {
363 	panic("RW lock %p not held by %p", lock, thread);
364 }
365 
366 __abortlike
367 static void
368 held_rwlock_notheld_with_info_panic(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, struct rw_lock_debug_entry *entry)
369 {
370 	if (type == LCK_RW_TYPE_EXCLUSIVE) {
371 		panic("RW lock %p not held in exclusive by %p caller %p read %d state 0x%x owner 0x%p ",
372 		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
373 		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
374 	} else {
375 		panic("RW lock %p not held in shared by %p caller %p read %d state 0x%x owner 0x%p ",
376 		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
377 		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
378 	}
379 }
380 
381 __attribute__((noinline))
382 static void
383 assert_held_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
384 {
385 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
386 
387 	if (__improbable(rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_locks_saved == 0)) {
388 		if (rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_overflow == 0) {
389 			held_rwlock_notheld_panic(lock, thread);
390 		}
391 		return;
392 	}
393 
394 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
395 	if (__probable(entry != NULL)) {
396 		if (type == LCK_RW_TYPE_EXCLUSIVE && entry->rwlde_mode_count != -1) {
397 			held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
398 		} else {
399 			if (type == LCK_RW_TYPE_SHARED && entry->rwlde_mode_count <= 0) {
400 				held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
401 			}
402 		}
403 	} else {
404 		if (rw_locks_held->rwld_overflow == 0) {
405 			held_rwlock_notheld_panic(lock, thread);
406 		}
407 	}
408 }
409 
410 static inline void
411 assert_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
412 {
413 	if (__improbable(!rw_lock_debug_disabled())) {
414 		assert_held_rwlock_slow(lock, thread, type);
415 	}
416 }
417 
418 __attribute__((noinline))
419 static void
420 change_held_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t typeFrom, void* caller)
421 {
422 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
423 	if (__improbable(rw_locks_held->rwld_locks_saved == 0)) {
424 		if (rw_locks_held->rwld_overflow == 0) {
425 			held_rwlock_notheld_panic(lock, thread);
426 		}
427 		return;
428 	}
429 
430 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
431 	if (__probable(entry != NULL)) {
432 		if (typeFrom == LCK_RW_TYPE_SHARED) {
433 			//We are upgrading
434 			assertf(entry->rwlde_mode_count == 1,
435 			    "RW lock %p not held by a single shared when upgrading "
436 			    "by %p caller %p read %d state 0x%x owner 0x%p ",
437 			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
438 			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
439 			entry->rwlde_mode_count = -1;
440 			set_rwlde_caller_packed(entry, caller);
441 		} else {
442 			//We are downgrading
443 			assertf(entry->rwlde_mode_count == -1,
444 			    "RW lock %p not held in write mode when downgrading "
445 			    "by %p caller %p read %d state 0x%x owner 0x%p ",
446 			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
447 			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
448 			entry->rwlde_mode_count = 1;
449 			set_rwlde_caller_packed(entry, caller);
450 		}
451 		return;
452 	}
453 
454 	if (rw_locks_held->rwld_overflow == 0) {
455 		held_rwlock_notheld_panic(lock, thread);
456 	}
457 
458 	if (rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER) {
459 		//array is full
460 		return;
461 	}
462 
463 	struct rw_lock_debug_entry *null_entry = find_empty_slot(rw_locks_held);
464 	null_entry->rwlde_lock = lock;
465 	set_rwlde_caller_packed(null_entry, caller);
466 	if (typeFrom == LCK_RW_TYPE_SHARED) {
467 		null_entry->rwlde_mode_count = -1;
468 	} else {
469 		null_entry->rwlde_mode_count = 1;
470 	}
471 	rw_locks_held->rwld_locks_saved++;
472 }
473 
474 static inline void
475 change_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t typeFrom, void* caller)
476 {
477 	if (__improbable(!rw_lock_debug_disabled())) {
478 		change_held_rwlock_slow(lock, thread, typeFrom, caller);
479 	}
480 }
481 
482 __abortlike
483 static void
484 add_held_rwlock_too_many_panic(thread_t thread)
485 {
486 	panic("RW lock too many rw locks held, rwld_locks_acquired maxed out for thread %p", thread);
487 }
488 
489 static __attribute__((noinline)) void
490 add_held_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, void* caller)
491 {
492 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
493 	struct rw_lock_debug_entry *null_entry;
494 	if (__improbable(rw_locks_held->rwld_locks_acquired == UINT32_MAX)) {
495 		add_held_rwlock_too_many_panic(thread);
496 	}
497 	rw_locks_held->rwld_locks_acquired++;
498 
499 	if (type == LCK_RW_TYPE_EXCLUSIVE) {
500 		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
501 			//array is full
502 			rw_locks_held->rwld_overflow = 1;
503 			return;
504 		}
505 		null_entry = find_empty_slot(rw_locks_held);
506 		null_entry->rwlde_lock = lock;
507 		set_rwlde_caller_packed(null_entry, caller);
508 		null_entry->rwlde_mode_count = -1;
509 		rw_locks_held->rwld_locks_saved++;
510 		return;
511 	} else {
512 		if (__probable(rw_locks_held->rwld_locks_saved == 0)) {
513 			//array is empty
514 			goto add_shared;
515 		}
516 
517 		boolean_t allow_shared_recursive;
518 		if (lck_rw_recursive_shared_assert_74048094) {
519 			allow_shared_recursive = (lock->lck_rw_priv_excl == 0);
520 		} else {
521 			allow_shared_recursive = TRUE;
522 		}
523 		if (allow_shared_recursive) {
524 			//It could be already locked in shared mode
525 			struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
526 			if (entry != NULL) {
527 				assert(entry->rwlde_mode_count > 0);
528 				assertf(entry->rwlde_mode_count != INT8_MAX,
529 				    "RW lock %p with too many recursive shared held "
530 				    "from %p caller %p read %d state 0x%x owner 0x%p",
531 				    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
532 				    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
533 				entry->rwlde_mode_count += 1;
534 				return;
535 			}
536 		}
537 
538 		//none of the locks were a match
539 		//try to add a new entry
540 		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
541 			//array is full
542 			rw_locks_held->rwld_overflow = 1;
543 			return;
544 		}
545 
546 add_shared:
547 		null_entry = find_empty_slot(rw_locks_held);
548 		null_entry->rwlde_lock = lock;
549 		set_rwlde_caller_packed(null_entry, caller);
550 		null_entry->rwlde_mode_count = 1;
551 		rw_locks_held->rwld_locks_saved++;
552 	}
553 }
554 
555 static inline void
556 add_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, void* caller)
557 {
558 	if (__improbable(!rw_lock_debug_disabled())) {
559 		add_held_rwlock_slow(lock, thread, type, caller);
560 	}
561 }
562 
563 static void
564 remove_held_rwlock_slow(lck_rw_t *lock, thread_t thread, lck_rw_type_t type)
565 {
566 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
567 	if (__improbable(rw_locks_held->rwld_locks_acquired == 0)) {
568 		return;
569 	}
570 	rw_locks_held->rwld_locks_acquired--;
571 
572 	if (rw_locks_held->rwld_locks_saved == 0) {
573 		assert(rw_locks_held->rwld_overflow == 1);
574 		goto out;
575 	}
576 
577 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
578 	if (__probable(entry != NULL)) {
579 		if (type == LCK_RW_TYPE_EXCLUSIVE) {
580 			assert(entry->rwlde_mode_count == -1);
581 			entry->rwlde_mode_count = 0;
582 		} else {
583 			assert(entry->rwlde_mode_count > 0);
584 			entry->rwlde_mode_count--;
585 			if (entry->rwlde_mode_count > 0) {
586 				goto out;
587 			}
588 		}
589 		entry->rwlde_caller_packed = 0;
590 		entry->rwlde_lock = NULL;
591 		rw_locks_held->rwld_locks_saved--;
592 	} else {
593 		assert(rw_locks_held->rwld_overflow == 1);
594 	}
595 
596 out:
597 	if (rw_locks_held->rwld_locks_acquired == 0) {
598 		rw_locks_held->rwld_overflow = 0;
599 	}
600 	return;
601 }
602 
603 static inline void
604 remove_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
605 {
606 	if (__improbable(!rw_lock_debug_disabled())) {
607 		remove_held_rwlock_slow(lock, thread, type);
608 	}
609 }
610 #endif /* DEBUG_RW */
611 
612 /*
613  * We disable interrupts while holding the RW interlock to prevent an
614  * interrupt from exacerbating hold time.
615  * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
616  */
617 static inline boolean_t
618 lck_interlock_lock(
619 	lck_rw_t        *lck)
620 {
621 	boolean_t       istate;
622 
623 	istate = ml_set_interrupts_enabled(FALSE);
624 	lck_rw_ilk_lock(lck);
625 	return istate;
626 }
627 
628 static inline void
629 lck_interlock_unlock(
630 	lck_rw_t        *lck,
631 	boolean_t       istate)
632 {
633 	lck_rw_ilk_unlock(lck);
634 	ml_set_interrupts_enabled(istate);
635 }
636 
637 /*
638  * compute the deadline to spin against when
639  * waiting for a change of state on a lck_rw_t
640  */
641 static inline uint64_t
642 lck_rw_deadline_for_spin(
643 	lck_rw_t        *lck)
644 {
645 	lck_rw_word_t   word;
646 
647 	word.data = ordered_load_rw(lck);
648 	if (word.can_sleep) {
649 		if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
650 			/*
651 			 * there are already threads waiting on this lock... this
652 			 * implies that they have spun beyond their deadlines waiting for
653 			 * the desired state to show up so we will not bother spinning at this time...
654 			 *   or
655 			 * the current number of threads sharing this lock exceeds our capacity to run them
656 			 * concurrently and since all states we're going to spin for require the rw_shared_count
657 			 * to be at 0, we'll not bother spinning since the latency for this to happen is
658 			 * unpredictable...
659 			 */
660 			return mach_absolute_time();
661 		}
662 		return mach_absolute_time() + os_atomic_load(&MutexSpin, relaxed);
663 	} else {
664 		return mach_absolute_time() + (100000LL * 1000000000LL);
665 	}
666 }
667 
668 /*
669  * This inline is used when busy-waiting for an rw lock.
670  * If interrupts were disabled when the lock primitive was called,
671  * we poll the IPI handler for pending tlb flushes in x86.
672  */
673 static inline void
674 lck_rw_lock_pause(
675 	boolean_t       interrupts_enabled)
676 {
677 #if __x86_64__
678 	if (!interrupts_enabled) {
679 		handle_pending_TLB_flushes();
680 	}
681 	cpu_pause();
682 #else
683 	(void) interrupts_enabled;
684 	wait_for_event();
685 #endif
686 }
687 
688 typedef enum __enum_closed {
689 	LCK_RW_DRAIN_S_DRAINED       = 0,
690 	LCK_RW_DRAIN_S_NOT_DRAINED   = 1,
691 	LCK_RW_DRAIN_S_EARLY_RETURN  = 2,
692 	LCK_RW_DRAIN_S_TIMED_OUT     = 3,
693 } lck_rw_drain_state_t;
694 
695 static lck_rw_drain_state_t
696 lck_rw_drain_status(
697 	lck_rw_t        *lock,
698 	uint32_t        status_mask,
699 	boolean_t       wait,
700 	bool            (^lock_pause)(void))
701 {
702 	uint64_t        deadline = 0;
703 	uint32_t        data;
704 	boolean_t       istate = FALSE;
705 
706 	if (wait) {
707 		deadline = lck_rw_deadline_for_spin(lock);
708 #if __x86_64__
709 		istate = ml_get_interrupts_enabled();
710 #endif
711 	}
712 
713 	for (;;) {
714 #if __x86_64__
715 		data = os_atomic_load(&lock->lck_rw_data, relaxed);
716 #else
717 		data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
718 #endif
719 		if ((data & status_mask) == 0) {
720 			atomic_exchange_abort();
721 			return LCK_RW_DRAIN_S_DRAINED;
722 		}
723 
724 		if (!wait) {
725 			atomic_exchange_abort();
726 			return LCK_RW_DRAIN_S_NOT_DRAINED;
727 		}
728 
729 		lck_rw_lock_pause(istate);
730 
731 		if (mach_absolute_time() >= deadline) {
732 			return LCK_RW_DRAIN_S_TIMED_OUT;
733 		}
734 
735 		if (lock_pause && lock_pause()) {
736 			return LCK_RW_DRAIN_S_EARLY_RETURN;
737 		}
738 	}
739 }
740 
741 /*
742  * Spin while interlock is held.
743  */
744 static inline void
745 lck_rw_interlock_spin(
746 	lck_rw_t        *lock)
747 {
748 	uint32_t        data, prev;
749 
750 	for (;;) {
751 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_relaxed);
752 		if (data & LCK_RW_INTERLOCK) {
753 #if __x86_64__
754 			cpu_pause();
755 #else
756 			wait_for_event();
757 #endif
758 		} else {
759 			atomic_exchange_abort();
760 			return;
761 		}
762 	}
763 }
764 
765 #define LCK_RW_GRAB_WANT        0
766 #define LCK_RW_GRAB_SHARED      1
767 
768 typedef enum __enum_closed __enum_options {
769 	LCK_RW_GRAB_F_SHARED    = 0x0,  // Not really a flag obviously but makes call sites more readable.
770 	LCK_RW_GRAB_F_WANT_EXCL = 0x1,
771 	LCK_RW_GRAB_F_WAIT      = 0x2,
772 } lck_rw_grab_flags_t;
773 
774 typedef enum __enum_closed {
775 	LCK_RW_GRAB_S_NOT_LOCKED    = 0,
776 	LCK_RW_GRAB_S_LOCKED        = 1,
777 	LCK_RW_GRAB_S_EARLY_RETURN  = 2,
778 	LCK_RW_GRAB_S_TIMED_OUT     = 3,
779 } lck_rw_grab_state_t;
780 
781 static lck_rw_grab_state_t
782 lck_rw_grab(
783 	lck_rw_t            *lock,
784 	lck_rw_grab_flags_t flags,
785 	bool                (^lock_pause)(void))
786 {
787 	uint64_t        deadline = 0;
788 	uint32_t        data, prev;
789 	boolean_t       do_exch, istate = FALSE;
790 
791 	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);
792 
793 	if ((flags & LCK_RW_GRAB_F_WAIT) != 0) {
794 		deadline = lck_rw_deadline_for_spin(lock);
795 #if __x86_64__
796 		istate = ml_get_interrupts_enabled();
797 #endif
798 	}
799 
800 	for (;;) {
801 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
802 		if (data & LCK_RW_INTERLOCK) {
803 			atomic_exchange_abort();
804 			lck_rw_interlock_spin(lock);
805 			continue;
806 		}
807 		do_exch = FALSE;
808 		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
809 			if ((data & LCK_RW_WANT_EXCL) == 0) {
810 				data |= LCK_RW_WANT_EXCL;
811 				do_exch = TRUE;
812 			}
813 		} else {        // LCK_RW_GRAB_SHARED
814 			if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
815 			    (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
816 				data += LCK_RW_SHARED_READER;
817 				do_exch = TRUE;
818 			}
819 		}
820 		if (do_exch) {
821 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
822 				return LCK_RW_GRAB_S_LOCKED;
823 			}
824 		} else {
825 			if ((flags & LCK_RW_GRAB_F_WAIT) == 0) {
826 				atomic_exchange_abort();
827 				return LCK_RW_GRAB_S_NOT_LOCKED;
828 			}
829 
830 			lck_rw_lock_pause(istate);
831 
832 			if (mach_absolute_time() >= deadline) {
833 				return LCK_RW_GRAB_S_TIMED_OUT;
834 			}
835 			if (lock_pause && lock_pause()) {
836 				return LCK_RW_GRAB_S_EARLY_RETURN;
837 			}
838 		}
839 	}
840 }
841 
842 /*
843  * The inverse of lck_rw_grab - drops either the LCK_RW_WANT_EXCL bit or
844  * decrements the reader count. Doesn't deal with waking up waiters - i.e.
845  * should only be called when can_sleep is false.
846  */
847 static void
848 lck_rw_drop(lck_rw_t *lock, lck_rw_grab_flags_t flags)
849 {
850 	uint32_t data, prev;
851 
852 	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);
853 	assert(!lock->lck_rw_can_sleep);
854 
855 	for (;;) {
856 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
857 
858 		/* Interlock should never be taken when can_sleep is false. */
859 		assert3u(data & LCK_RW_INTERLOCK, ==, 0);
860 
861 		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
862 			data &= ~LCK_RW_WANT_EXCL;
863 		} else {
864 			data -= LCK_RW_SHARED_READER;
865 		}
866 
867 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
868 			break;
869 		}
870 
871 		cpu_pause();
872 	}
873 
874 	return;
875 }
876 
877 static boolean_t
878 lck_rw_lock_exclusive_gen(
879 	lck_rw_t        *lock,
880 	bool            (^lock_pause)(void))
881 {
882 	__assert_only thread_t self = current_thread();
883 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
884 	lck_rw_word_t           word;
885 	int                     slept = 0;
886 	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
887 	lck_rw_drain_state_t    drain_state = LCK_RW_DRAIN_S_NOT_DRAINED;
888 	wait_result_t           res = 0;
889 	boolean_t               istate;
890 
891 #if     CONFIG_DTRACE
892 	boolean_t dtrace_ls_initialized = FALSE;
893 	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
894 	uint64_t wait_interval = 0;
895 	int readers_at_sleep = 0;
896 #endif
897 
898 	assertf(lock->lck_rw_owner != self->ctid,
899 	    "Lock already held state=0x%x, owner=%p",
900 	    ordered_load_rw(lock), self);
901 
902 #ifdef DEBUG_RW
903 	/*
904 	 * Best effort attempt to check that this thread
905 	 * is not already holding the lock (this checks read mode too).
906 	 */
907 	assert_canlock_rwlock(lock, self, LCK_RW_TYPE_EXCLUSIVE);
908 #endif /* DEBUG_RW */
909 
910 	/*
911 	 *	Try to acquire the lck_rw_want_excl bit.
912 	 */
913 	while (lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL, NULL) != LCK_RW_GRAB_S_LOCKED) {
914 #if     CONFIG_DTRACE
915 		if (dtrace_ls_initialized == FALSE) {
916 			dtrace_ls_initialized = TRUE;
917 			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
918 			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
919 			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
920 			if (dtrace_ls_enabled) {
921 				/*
922 				 * Either sleeping or spinning is happening,
923 				 *  start a timing of our delay interval now.
924 				 */
925 				readers_at_sleep = lock->lck_rw_shared_count;
926 				wait_interval = mach_absolute_time();
927 			}
928 		}
929 #endif
930 
931 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START,
932 		    trace_lck, 0, 0, 0, 0);
933 
934 		grab_state = lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT, lock_pause);
935 
936 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END,
937 		    trace_lck, 0, 0, grab_state, 0);
938 
939 		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
940 		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
941 			break;
942 		}
943 		/*
944 		 * if we get here, the deadline has expired w/o us
945 		 * being able to grab the lock exclusively
946 		 * check to see if we're allowed to do a thread_block
947 		 */
948 		word.data = ordered_load_rw(lock);
949 		if (word.can_sleep) {
950 			istate = lck_interlock_lock(lock);
951 			word.data = ordered_load_rw(lock);
952 
953 			if (word.want_excl) {
954 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
955 
956 				word.w_waiting = 1;
957 				ordered_store_rw(lock, word.data);
958 
959 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
960 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
961 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
962 				lck_interlock_unlock(lock, istate);
963 				if (res == THREAD_WAITING) {
964 					res = thread_block(THREAD_CONTINUE_NULL);
965 					slept++;
966 				}
967 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
968 			} else {
969 				word.want_excl = 1;
970 				ordered_store_rw(lock, word.data);
971 				lck_interlock_unlock(lock, istate);
972 				break;
973 			}
974 		}
975 	}
976 
977 	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
978 		assert(lock_pause);
979 		return FALSE;
980 	}
981 
982 	/*
983 	 * Wait for readers (and upgrades) to finish...
984 	 */
985 	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
986 #if     CONFIG_DTRACE
987 		/*
988 		 * Either sleeping or spinning is happening, start
989 		 * a timing of our delay interval now.  If we set it
990 		 * to -1 we don't have accurate data so we cannot later
991 		 * decide to record a dtrace spin or sleep event.
992 		 */
993 		if (dtrace_ls_initialized == FALSE) {
994 			dtrace_ls_initialized = TRUE;
995 			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
996 			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
997 			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
998 			if (dtrace_ls_enabled) {
999 				/*
1000 				 * Either sleeping or spinning is happening,
1001 				 *  start a timing of our delay interval now.
1002 				 */
1003 				readers_at_sleep = lock->lck_rw_shared_count;
1004 				wait_interval = mach_absolute_time();
1005 			}
1006 		}
1007 #endif
1008 
1009 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1010 
1011 		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE, lock_pause);
1012 
1013 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, drain_state, 0);
1014 
1015 		if (drain_state == LCK_RW_DRAIN_S_DRAINED ||
1016 		    drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
1017 			break;
1018 		}
1019 		/*
1020 		 * if we get here, the deadline has expired w/o us
1021 		 * being able to grab the lock exclusively
1022 		 * check to see if we're allowed to do a thread_block
1023 		 */
1024 		word.data = ordered_load_rw(lock);
1025 		if (word.can_sleep) {
1026 			istate = lck_interlock_lock(lock);
1027 			word.data = ordered_load_rw(lock);
1028 
1029 			if (word.shared_count != 0 || word.want_upgrade) {
1030 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1031 
1032 				word.w_waiting = 1;
1033 				ordered_store_rw(lock, word.data);
1034 
1035 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1036 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1037 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1038 				lck_interlock_unlock(lock, istate);
1039 
1040 				if (res == THREAD_WAITING) {
1041 					res = thread_block(THREAD_CONTINUE_NULL);
1042 					slept++;
1043 				}
1044 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1045 			} else {
1046 				lck_interlock_unlock(lock, istate);
1047 				/*
1048 				 * must own the lock now, since we checked for
1049 				 * readers or upgrade owner behind the interlock
1050 				 * no need for a call to 'lck_rw_drain_status'
1051 				 */
1052 				break;
1053 			}
1054 		}
1055 	}
1056 
1057 #if     CONFIG_DTRACE
1058 	/*
1059 	 * Decide what latencies we suffered that are Dtrace events.
1060 	 * If we have set wait_interval, then we either spun or slept.
1061 	 * At least we get out from under the interlock before we record
1062 	 * which is the best we can do here to minimize the impact
1063 	 * of the tracing.
1064 	 * If we have set wait_interval to -1, then dtrace was not enabled when we
1065 	 * started sleeping/spinning so we don't record this event.
1066 	 */
1067 	if (dtrace_ls_enabled == TRUE) {
1068 		if (slept == 0) {
1069 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
1070 			    mach_absolute_time() - wait_interval, 1);
1071 		} else {
1072 			/*
1073 			 * For the blocking case, we also record if when we blocked
1074 			 * it was held for read or write, and how many readers.
1075 			 * Notice that above we recorded this before we dropped
1076 			 * the interlock so the count is accurate.
1077 			 */
1078 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
1079 			    mach_absolute_time() - wait_interval, 1,
1080 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1081 		}
1082 	}
1083 #endif /* CONFIG_DTRACE */
1084 
1085 	if (drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
1086 		lck_rw_drop(lock, LCK_RW_GRAB_F_WANT_EXCL);
1087 		assert(lock_pause);
1088 		return FALSE;
1089 	}
1090 
1091 #if CONFIG_DTRACE
1092 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
1093 #endif  /* CONFIG_DTRACE */
1094 
1095 	return TRUE;
1096 }
1097 
1098 #define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \
1099 	    (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
1100 	    LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
1101 /*!
1102  * @function lck_rw_lock_exclusive_check_contended
1103  *
1104  * @abstract
1105  * Locks a rw_lock in exclusive mode.
1106  *
1107  * @discussion
1108  * This routine IS EXPERIMENTAL.
1109  * It's only used for the vm object lock, and use for other subsystems is UNSUPPORTED.
1110  * Note that the return value is ONLY A HEURISTIC w.r.t. the lock's contention.
1111  *
1112  * @param lock           rw_lock to lock.
1113  *
1114  * @returns Returns TRUE if the thread spun or blocked while attempting to acquire the lock, FALSE
1115  *          otherwise.
1116  */
1117 bool
1118 lck_rw_lock_exclusive_check_contended(
1119 	lck_rw_t        *lock)
1120 {
1121 	thread_t        thread = current_thread();
1122 	bool            contended  = false;
1123 
1124 	if (lock->lck_rw_can_sleep) {
1125 		lck_rw_lock_count_inc(thread, lock);
1126 	} else if (get_preemption_level() == 0) {
1127 		panic("Taking non-sleepable RW lock with preemption enabled");
1128 	}
1129 
1130 	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1131 #if     CONFIG_DTRACE
1132 		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1133 #endif  /* CONFIG_DTRACE */
1134 	} else {
1135 		contended = true;
1136 		(void) lck_rw_lock_exclusive_gen(lock, NULL);
1137 	}
1138 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1139 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1140 	ordered_store_rw_owner(lock, thread->ctid);
1141 
1142 #ifdef DEBUG_RW
1143 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, __builtin_return_address(0));
1144 #endif /* DEBUG_RW */
1145 	return contended;
1146 }
1147 
1148 __attribute__((always_inline))
1149 static boolean_t
1150 lck_rw_lock_exclusive_internal_inline(
1151 	lck_rw_t        *lock,
1152 	void            *caller,
1153 	bool            (^lock_pause)(void))
1154 {
1155 #pragma unused(caller)
1156 	thread_t        thread = current_thread();
1157 
1158 	if (lock->lck_rw_can_sleep) {
1159 		lck_rw_lock_count_inc(thread, lock);
1160 	} else if (get_preemption_level() == 0) {
1161 		panic("Taking non-sleepable RW lock with preemption enabled");
1162 	}
1163 
1164 	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1165 #if     CONFIG_DTRACE
1166 		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1167 #endif  /* CONFIG_DTRACE */
1168 	} else if (!lck_rw_lock_exclusive_gen(lock, lock_pause)) {
1169 		/*
1170 		 * lck_rw_lock_exclusive_gen() should only return
1171 		 * early if lock_pause has been passed and
1172 		 * returns FALSE. lock_pause is exclusive with
1173 		 * lck_rw_can_sleep().
1174 		 */
1175 		assert(!lock->lck_rw_can_sleep);
1176 		return FALSE;
1177 	}
1178 
1179 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1180 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1181 	ordered_store_rw_owner(lock, thread->ctid);
1182 
1183 #if DEBUG_RW
1184 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1185 #endif /* DEBUG_RW */
1186 
1187 	return TRUE;
1188 }
1189 
1190 __attribute__((noinline))
1191 static void
1192 lck_rw_lock_exclusive_internal(
1193 	lck_rw_t        *lock,
1194 	void            *caller)
1195 {
1196 	(void) lck_rw_lock_exclusive_internal_inline(lock, caller, NULL);
1197 }
1198 
1199 /*!
1200  * @function lck_rw_lock_exclusive
1201  *
1202  * @abstract
1203  * Locks a rw_lock in exclusive mode.
1204  *
1205  * @discussion
1206  * This function can block.
1207  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1208  * can acquire it in exclusive mode.
1209  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1210  *
1211  * @param lock           rw_lock to lock.
1212  */
1213 void
1214 lck_rw_lock_exclusive(
1215 	lck_rw_t        *lock)
1216 {
1217 	(void) lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), NULL);
1218 }
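
/*
 * Illustrative sketch of the exclusive lock/unlock pairing described above
 * (hypothetical lock "my_lock"; the unlock may also be done with
 * lck_rw_done() when the held mode is not known statically):
 *
 *	lck_rw_lock_exclusive(&my_lock);
 *	... mutate protected state ...
 *	lck_rw_unlock_exclusive(&my_lock);
 */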
1219 
1220 /*!
1221  * @function lck_rw_lock_exclusive_b
1222  *
1223  * @abstract
1224  * Locks a rw_lock in exclusive mode. Returns early if the lock can't be acquired
1225  * and the specified block returns true.
1226  *
1227  * @discussion
1228  * Identical to lck_rw_lock_exclusive() but can return early if the lock can't be
1229  * acquired and the specified block returns true. The block is called
1230  * repeatedly when waiting to acquire the lock.
1231  * Should only be called when the lock cannot sleep (i.e. when
1232  * lock->lck_rw_can_sleep is false).
1233  *
1234  * @param lock           rw_lock to lock.
1235  * @param lock_pause     block invoked while waiting to acquire lock
1236  *
1237  * @returns              Returns TRUE if the lock is successfully taken,
1238  *                       FALSE if the block returns true and the lock has
1239  *                       not been acquired.
1240  */
1241 boolean_t
1242 lck_rw_lock_exclusive_b(
1243 	lck_rw_t        *lock,
1244 	bool            (^lock_pause)(void))
1245 {
1246 	assert(!lock->lck_rw_can_sleep);
1247 
1248 	return lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), lock_pause);
1249 }
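
/*
 * Illustrative sketch of the lock_pause variant described above, used on a
 * non-sleepable lock. "obj" and "should_abort()" are hypothetical:
 *
 *	if (!lck_rw_lock_exclusive_b(&obj->lock, ^bool (void) {
 *	        return should_abort(obj);       // bail out of the spin
 *	})) {
 *	        return;                         // gave up without the lock
 *	}
 *	... critical section ...
 *	lck_rw_unlock_exclusive(&obj->lock);
 */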
1250 
1251 /*
1252  *	Routine:	lck_rw_lock_shared_gen
1253  *	Function:
1254  *		Fast path code has determined that this lock
1255  *		is held exclusively... this is where we spin/block
1256  *		until we can acquire the lock in the shared mode
1257  */
1258 static boolean_t
1259 lck_rw_lock_shared_gen(
1260 	lck_rw_t        *lck,
1261 	bool            (^lock_pause)(void))
1262 {
1263 	__assert_only thread_t  self = current_thread();
1264 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1265 	lck_rw_word_t           word;
1266 	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
1267 	int                     slept = 0;
1268 	wait_result_t           res = 0;
1269 	boolean_t               istate;
1270 
1271 #if     CONFIG_DTRACE
1272 	uint64_t wait_interval = 0;
1273 	int readers_at_sleep = 0;
1274 	boolean_t dtrace_ls_initialized = FALSE;
1275 	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1276 #endif /* CONFIG_DTRACE */
1277 
1278 	assertf(lck->lck_rw_owner != self->ctid,
1279 	    "Lock already held state=0x%x, owner=%p",
1280 	    ordered_load_rw(lck), self);
1281 
1282 #ifdef DEBUG_RW
1283 	/*
1284 	 * Best effort attempt to check that this thread
1285 	 * is not already holding the lock in shared mode.
1286 	 */
1287 	assert_canlock_rwlock(lck, self, LCK_RW_TYPE_SHARED);
1288 #endif
1289 
1290 	while (lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED, NULL) != LCK_RW_GRAB_S_LOCKED) {
1291 #if     CONFIG_DTRACE
1292 		if (dtrace_ls_initialized == FALSE) {
1293 			dtrace_ls_initialized = TRUE;
1294 			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1295 			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1296 			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1297 			if (dtrace_ls_enabled) {
1298 				/*
1299 				 * Either sleeping or spinning is happening,
1300 				 *  start a timing of our delay interval now.
1301 				 */
1302 				readers_at_sleep = lck->lck_rw_shared_count;
1303 				wait_interval = mach_absolute_time();
1304 			}
1305 		}
1306 #endif
1307 
1308 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1309 		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);
1310 
1311 		grab_state = lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED | LCK_RW_GRAB_F_WAIT, lock_pause);
1312 
1313 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1314 		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, grab_state, 0);
1315 
1316 		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
1317 		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
1318 			break;
1319 		}
1320 
1321 		/*
1322 		 * if we get here, the deadline has expired w/o us
1323 		 * being able to grab the lock for read
1324 		 * check to see if we're allowed to do a thread_block
1325 		 */
1326 		if (lck->lck_rw_can_sleep) {
1327 			istate = lck_interlock_lock(lck);
1328 
1329 			word.data = ordered_load_rw(lck);
1330 			if ((word.want_excl || word.want_upgrade) &&
1331 			    ((word.shared_count == 0) || word.priv_excl)) {
1332 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1333 				    trace_lck, word.want_excl, word.want_upgrade, 0, 0);
1334 
1335 				word.r_waiting = 1;
1336 				ordered_store_rw(lck, word.data);
1337 
1338 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1339 				res = assert_wait(LCK_RW_READER_EVENT(lck),
1340 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1341 				lck_interlock_unlock(lck, istate);
1342 
1343 				if (res == THREAD_WAITING) {
1344 					res = thread_block(THREAD_CONTINUE_NULL);
1345 					slept++;
1346 				}
1347 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1348 				    trace_lck, res, slept, 0, 0);
1349 			} else {
1350 				word.shared_count++;
1351 				ordered_store_rw(lck, word.data);
1352 				lck_interlock_unlock(lck, istate);
1353 				break;
1354 			}
1355 		}
1356 	}
1357 
1358 #if     CONFIG_DTRACE
1359 	if (dtrace_ls_enabled == TRUE) {
1360 		if (slept == 0) {
1361 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1362 		} else {
1363 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1364 			    mach_absolute_time() - wait_interval, 0,
1365 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1366 		}
1367 	}
1368 #endif /* CONFIG_DTRACE */
1369 
1370 	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
1371 		assert(lock_pause);
1372 		return FALSE;
1373 	}
1374 
1375 #if     CONFIG_DTRACE
1376 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1377 #endif  /* CONFIG_DTRACE */
1378 
1379 	return TRUE;
1380 }
1381 
1382 __attribute__((always_inline))
1383 static boolean_t
1384 lck_rw_lock_shared_internal_inline(
1385 	lck_rw_t        *lock,
1386 	void            *caller,
1387 	bool            (^lock_pause)(void))
1388 {
1389 #pragma unused(caller)
1390 
1391 	uint32_t        data, prev;
1392 	thread_t        thread = current_thread();
1393 #ifdef DEBUG_RW
1394 	boolean_t       check_canlock = TRUE;
1395 #endif
1396 
1397 	if (lock->lck_rw_can_sleep) {
1398 		lck_rw_lock_count_inc(thread, lock);
1399 	} else if (get_preemption_level() == 0) {
1400 		panic("Taking non-sleepable RW lock with preemption enabled");
1401 	}
1402 
1403 	for (;;) {
1404 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1405 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1406 			atomic_exchange_abort();
1407 			if (!lck_rw_lock_shared_gen(lock, lock_pause)) {
1408 				/*
1409 				 * lck_rw_lock_shared_gen() should only return
1410 				 * early if lock_pause has been passed and
1411 				 * returns FALSE. lock_pause is exclusive with
1412 				 * lck_rw_can_sleep().
1413 				 */
1414 				assert(!lock->lck_rw_can_sleep);
1415 				return FALSE;
1416 			}
1417 
1418 			goto locked;
1419 		}
1420 #ifdef DEBUG_RW
1421 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1422 			/*
1423 			 * If the lock is uncontended,
1424 			 * we do not need to check if we can lock it
1425 			 */
1426 			check_canlock = FALSE;
1427 		}
1428 #endif
1429 		data += LCK_RW_SHARED_READER;
1430 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1431 			break;
1432 		}
1433 		cpu_pause();
1434 	}
1435 #ifdef DEBUG_RW
1436 	if (check_canlock) {
1437 		/*
1438 		 * Best effort attempt to check that this thread
1439 		 * is not already holding the lock (this checks read mode too).
1440 		 */
1441 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1442 	}
1443 #endif
1444 locked:
1445 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1446 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1447 
1448 #if     CONFIG_DTRACE
1449 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1450 #endif  /* CONFIG_DTRACE */
1451 
1452 #ifdef DEBUG_RW
1453 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1454 #endif /* DEBUG_RW */
1455 
1456 	return TRUE;
1457 }
1458 
1459 __attribute__((noinline))
1460 static void
1461 lck_rw_lock_shared_internal(
1462 	lck_rw_t        *lock,
1463 	void            *caller)
1464 {
1465 	(void) lck_rw_lock_shared_internal_inline(lock, caller, NULL);
1466 }
1467 
1468 /*!
1469  * @function lck_rw_lock_shared
1470  *
1471  * @abstract
1472  * Locks a rw_lock in shared mode.
1473  *
1474  * @discussion
1475  * This function can block.
1476  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1477  * can acquire it in exclusive mode.
1478  * If the lock is held in shared mode and there are no writers waiting, a reader will be able to acquire
1479  * the lock without waiting.
1480  * If the lock is held in shared mode and there is at least a writer waiting, a reader will wait
1481  * for all the writers to make progress if the lock was initialized with the default settings. Instead if
1482  * RW_SHARED_PRIORITY was selected at initialization time, a reader will never wait if the lock is held
1483  * in shared mode.
1484  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1485  *
1486  * @param lock           rw_lock to lock.
1487  */
1488 void
1489 lck_rw_lock_shared(
1490 	lck_rw_t        *lock)
1491 {
1492 	(void) lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), NULL);
1493 }
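
/*
 * Illustrative sketch of the shared lock/unlock pairing described above
 * (hypothetical lock "my_lock"):
 *
 *	lck_rw_lock_shared(&my_lock);
 *	... read protected state ...
 *	lck_rw_unlock_shared(&my_lock);
 */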
1494 
1495 /*!
1496  * @function lck_rw_lock_shared_b
1497  *
1498  * @abstract
1499  * Locks a rw_lock in shared mode. Returns early if the lock can't be acquired
1500  * and the specified block returns true.
1501  *
1502  * @discussion
1503  * Identical to lck_rw_lock_shared() but can return early if the lock can't be
1504  * acquired and the specified block returns true. The block is called
1505  * repeatedly when waiting to acquire the lock.
1506  * Should only be called when the lock cannot sleep (i.e. when
1507  * lock->lck_rw_can_sleep is false).
1508  *
1509  * @param lock           rw_lock to lock.
1510  * @param lock_pause     block invoked while waiting to acquire lock
1511  *
1512  * @returns              Returns TRUE if the lock is successfully taken,
1513  *                       FALSE if the block returns true and the lock has
1514  *                       not been acquired.
1515  */
1516 boolean_t
1517 lck_rw_lock_shared_b(
1518 	lck_rw_t        *lock,
1519 	bool            (^lock_pause)(void))
1520 {
1521 	assert(!lock->lck_rw_can_sleep);
1522 
1523 	return lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), lock_pause);
1524 }
1525 
1526 /*
1527  *	Routine:	lck_rw_lock_shared_to_exclusive_failure
1528  *	Function:
1529  *		Fast path code has already dropped our read
1530  *		count and determined that someone else owns 'lck_rw_want_upgrade'
1531  *		if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting'
1532  *		all we need to do here is determine if a wakeup is needed
1533  */
1534 static boolean_t
1535 lck_rw_lock_shared_to_exclusive_failure(
1536 	lck_rw_t        *lck,
1537 	uint32_t        prior_lock_state)
1538 {
1539 	thread_t        thread = current_thread();
1540 
1541 	if ((prior_lock_state & LCK_RW_W_WAITING) &&
1542 	    ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1543 		/*
1544 		 *	Someone else has requested upgrade.
1545 		 *	Since we've released the read lock, wake
1546 		 *	him up if he's blocked waiting
1547 		 */
1548 		thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1549 	}
1550 
1551 	/* Check if dropping the lock means that we need to unpromote */
1552 	if (lck->lck_rw_can_sleep) {
1553 		lck_rw_lock_count_dec(thread, lck);
1554 	}
1555 
1556 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1557 	    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1558 
1559 #ifdef DEBUG_RW
1560 	remove_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
1561 #endif /* DEBUG_RW */
1562 
1563 	return FALSE;
1564 }
1565 
1566 /*
1567  *	Routine:	lck_rw_lock_shared_to_exclusive_success
1568  *	Function:
1569  *		The fast path code has already dropped our read
1570  *		count and successfully acquired 'lck_rw_want_upgrade';
1571  *		we just need to wait for the rest of the readers to drain,
1572  *		and then we can return as the exclusive holder of this lock.
1573  */
1574 static void
1575 lck_rw_lock_shared_to_exclusive_success(
1576 	lck_rw_t        *lock)
1577 {
1578 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1579 	int                     slept = 0;
1580 	lck_rw_word_t           word;
1581 	wait_result_t           res;
1582 	boolean_t               istate;
1583 	lck_rw_drain_state_t    drain_state;
1584 
1585 #if     CONFIG_DTRACE
1586 	uint64_t                wait_interval = 0;
1587 	int                     readers_at_sleep = 0;
1588 	boolean_t               dtrace_ls_initialized = FALSE;
1589 	boolean_t               dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1590 #endif
1591 
1592 	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
1593 		word.data = ordered_load_rw(lock);
1594 #if     CONFIG_DTRACE
1595 		if (dtrace_ls_initialized == FALSE) {
1596 			dtrace_ls_initialized = TRUE;
1597 			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1598 			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1599 			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1600 			if (dtrace_ls_enabled) {
1601 				/*
1602 				 * Either sleeping or spinning is happening,
1603 				 *  start a timing of our delay interval now.
1604 				 */
1605 				readers_at_sleep = word.shared_count;
1606 				wait_interval = mach_absolute_time();
1607 			}
1608 		}
1609 #endif
1610 
1611 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1612 		    trace_lck, word.shared_count, 0, 0, 0);
1613 
1614 		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE, NULL);
1615 
1616 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1617 		    trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
1618 
1619 		if (drain_state == LCK_RW_DRAIN_S_DRAINED) {
1620 			break;
1621 		}
1622 
1623 		/*
1624 		 * if we get here, the spin deadline in lck_rw_wait_on_status()
1625 		 * has expired w/o the rw_shared_count having drained to 0
1626 		 * check to see if we're allowed to do a thread_block
1627 		 */
1628 		if (word.can_sleep) {
1629 			istate = lck_interlock_lock(lock);
1630 
1631 			word.data = ordered_load_rw(lock);
1632 			if (word.shared_count != 0) {
1633 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1634 				    trace_lck, word.shared_count, 0, 0, 0);
1635 
1636 				word.w_waiting = 1;
1637 				ordered_store_rw(lock, word.data);
1638 
1639 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1640 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1641 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1642 				lck_interlock_unlock(lock, istate);
1643 
1644 				if (res == THREAD_WAITING) {
1645 					res = thread_block(THREAD_CONTINUE_NULL);
1646 					slept++;
1647 				}
1648 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1649 				    trace_lck, res, slept, 0, 0);
1650 			} else {
1651 				lck_interlock_unlock(lock, istate);
1652 				break;
1653 			}
1654 		}
1655 	}
1656 #if     CONFIG_DTRACE
1657 	/*
1658 	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1659 	 */
1660 	if (dtrace_ls_enabled == TRUE) {
1661 		if (slept == 0) {
1662 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
1663 		} else {
1664 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
1665 			    mach_absolute_time() - wait_interval, 1,
1666 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1667 		}
1668 	}
1669 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1670 #endif
1671 }
1672 
1673 /*!
1674  * @function lck_rw_lock_shared_to_exclusive
1675  *
1676  * @abstract
1677  * Upgrades a rw_lock held in shared mode to exclusive.
1678  *
1679  * @discussion
1680  * This function can block.
1681  * Only one reader at a time can upgrade to exclusive mode. If the upgrade fails, the function will
1682  * return with the lock not held.
1683  * The caller needs to hold the lock in shared mode to upgrade it.
1684  *
1685  * @param lock           rw_lock already held in shared mode to upgrade.
1686  *
1687  * @returns TRUE if the lock was upgraded, FALSE if it was not possible.
1688  *          If the function was not able to upgrade the lock, the lock will be dropped
1689  *          by the function.
1690  */
1691 boolean_t
1692 lck_rw_lock_shared_to_exclusive(
1693 	lck_rw_t        *lock)
1694 {
1695 	thread_t thread = current_thread();
1696 	uint32_t data, prev;
1697 
1698 	assertf(lock->lck_rw_priv_excl != 0, "lock %p thread %p", lock, current_thread());
1699 
1700 #if DEBUG_RW
1701 	assert_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1702 #endif /* DEBUG_RW */
1703 
1704 	for (;;) {
1705 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1706 		if (data & LCK_RW_INTERLOCK) {
1707 			atomic_exchange_abort();
1708 			lck_rw_interlock_spin(lock);
1709 			continue;
1710 		}
1711 		if (data & LCK_RW_WANT_UPGRADE) {
1712 			data -= LCK_RW_SHARED_READER;
1713 			if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
1714 				data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
1715 			}
1716 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1717 				return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1718 			}
1719 		} else {
1720 			data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
1721 			data -= LCK_RW_SHARED_READER;           /* and shed our read count */
1722 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1723 				break;
1724 			}
1725 		}
1726 		cpu_pause();
1727 	}
1728 	/* we now own the WANT_UPGRADE */
1729 	if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
1730 		lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
1731 	}
1732 
1733 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1734 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1735 
1736 	ordered_store_rw_owner(lock, thread->ctid);
1737 #if     CONFIG_DTRACE
1738 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1739 #endif  /* CONFIG_DTRACE */
1740 
1741 #if DEBUG_RW
1742 	change_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, __builtin_return_address(0));
1743 #endif /* DEBUG_RW */
1744 	return TRUE;
1745 }
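/*
 * Editor's note: a hedged sketch (not part of the original source) of the
 * usual upgrade pattern. On failure the shared hold has already been dropped,
 * so the caller re-acquires exclusive and re-validates. 'g_map_lock' is
 * hypothetical.
 *
 *	lck_rw_lock_shared(&g_map_lock);
 *	// ... decide that a modification is needed ...
 *	if (!lck_rw_lock_shared_to_exclusive(&g_map_lock)) {
 *		// upgrade failed and the lock was dropped by the call
 *		lck_rw_lock_exclusive(&g_map_lock);
 *		// ... re-validate any state observed while the lock was not held ...
 *	}
 *	// ... modify the protected state ...
 *	lck_rw_unlock_exclusive(&g_map_lock);
 */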
1746 
1747 /*
1748  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1749  *      Function:
1750  *		Fast path has already dropped
1751  *		our exclusive state and bumped lck_rw_shared_count
1752  *		all we need to do here is determine if anyone
1753  *		needs to be awakened.
1754  */
1755 static void
1756 lck_rw_lock_exclusive_to_shared_gen(
1757 	lck_rw_t        *lck,
1758 	uint32_t        prior_lock_state,
1759 	void            *caller)
1760 {
1761 #pragma unused(caller)
1762 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1763 	lck_rw_word_t   fake_lck;
1764 
1765 	/*
1766 	 * prior_lock_state is a snapshot of the 1st word of the
1767 	 * lock in question... we'll fake up a local copy of it
1768 	 * and carefully not access anything beyond what's defined
1769 	 * in the first word of a lck_rw_t
1770 	 */
1771 	fake_lck.data = prior_lock_state;
1772 
1773 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1774 	    trace_lck, fake_lck.want_excl, fake_lck.want_upgrade, 0, 0);
1775 
1776 	/*
1777 	 * don't wake up anyone waiting to take the lock exclusively
1778 	 * since we hold a read count... when the read count drops to 0,
1779 	 * the writers will be woken.
1780 	 *
1781 	 * wake up any waiting readers if we don't have any writers waiting,
1782 	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1783 	 */
1784 	if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1785 		thread_wakeup(LCK_RW_READER_EVENT(lck));
1786 	}
1787 
1788 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1789 	    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1790 
1791 #if CONFIG_DTRACE
1792 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1793 #endif
1794 
1795 #if DEBUG_RW
1796 	thread_t        thread = current_thread();
1797 	change_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1798 #endif /* DEBUG_RW */
1799 }
1800 
1801 /*!
1802  * @function lck_rw_lock_exclusive_to_shared
1803  *
1804  * @abstract
1805  * Downgrades a rw_lock held in exclusive mode to shared.
1806  *
1807  * @discussion
1808  * The caller needs to hold the lock in exclusive mode to be able to downgrade it.
1809  *
1810  * @param lock           rw_lock already held in exclusive mode to downgrade.
1811  */
1812 void
1813 lck_rw_lock_exclusive_to_shared(
1814 	lck_rw_t        *lock)
1815 {
1816 	uint32_t        data, prev;
1817 
1818 	assertf(lock->lck_rw_owner == current_thread()->ctid,
1819 	    "state=0x%x, owner=%p", lock->lck_rw_data,
1820 	    ctid_get_thread_unsafe(lock->lck_rw_owner));
1821 	ordered_store_rw_owner(lock, 0);
1822 
1823 	for (;;) {
1824 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1825 		if (data & LCK_RW_INTERLOCK) {
1826 			atomic_exchange_abort();
1827 			lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
1828 			continue;
1829 		}
1830 		data += LCK_RW_SHARED_READER;
1831 		if (data & LCK_RW_WANT_UPGRADE) {
1832 			data &= ~(LCK_RW_WANT_UPGRADE);
1833 		} else {
1834 			data &= ~(LCK_RW_WANT_EXCL);
1835 		}
1836 		if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1837 			data &= ~(LCK_RW_W_WAITING);
1838 		}
1839 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1840 			break;
1841 		}
1842 		cpu_pause();
1843 	}
1844 	lck_rw_lock_exclusive_to_shared_gen(lock, prev, __builtin_return_address(0));
1845 }
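/*
 * Editor's note: a minimal sketch (not part of the original source) of
 * downgrading after an update so other readers can proceed while this thread
 * keeps a read-only hold. 'g_map_lock' is hypothetical.
 *
 *	lck_rw_lock_exclusive(&g_map_lock);
 *	// ... perform the update ...
 *	lck_rw_lock_exclusive_to_shared(&g_map_lock);
 *	// ... continue with read-only work under the shared hold ...
 *	lck_rw_unlock_shared(&g_map_lock);
 */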
1846 
1847 /*
1848  * Very sad hack, but the codegen for lck_rw_lock
1849  * is very unhappy with the combination of __builtin_return_address()
1850  * and a noreturn function. For some reason it adds more frames
1851  * than it should. rdar://76570684
1852  */
1853 void
1854 _lck_rw_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
1855 #pragma clang diagnostic push
1856 #pragma clang diagnostic ignored "-Wmissing-noreturn"
1857 __attribute__((noinline, weak))
1858 void
1859 _lck_rw_lock_type_panic(
1860 	lck_rw_t        *lck,
1861 	lck_rw_type_t   lck_rw_type)
1862 {
1863 	panic("lck_rw_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
1864 }
1865 #pragma clang diagnostic pop
1866 
1867 /*!
1868  * @function lck_rw_lock
1869  *
1870  * @abstract
1871  * Locks a rw_lock with the specified type.
1872  *
1873  * @discussion
1874  * See lck_rw_lock_shared() or lck_rw_lock_exclusive() for more details.
1875  *
1876  * @param lck           rw_lock to lock.
1877  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1878  */
1879 void
1880 lck_rw_lock(
1881 	lck_rw_t        *lck,
1882 	lck_rw_type_t   lck_rw_type)
1883 {
1884 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1885 		return lck_rw_lock_shared_internal(lck, __builtin_return_address(0));
1886 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1887 		return lck_rw_lock_exclusive_internal(lck, __builtin_return_address(0));
1888 	}
1889 	_lck_rw_lock_type_panic(lck, lck_rw_type);
1890 }
1891 
1892 __attribute__((always_inline))
1893 static boolean_t
1894 lck_rw_try_lock_shared_internal_inline(
1895 	lck_rw_t        *lock,
1896 	void            *caller)
1897 {
1898 #pragma unused(caller)
1899 
1900 	uint32_t        data, prev;
1901 	thread_t        thread = current_thread();
1902 #ifdef DEBUG_RW
1903 	boolean_t       check_canlock = TRUE;
1904 #endif
1905 
1906 	for (;;) {
1907 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1908 		if (data & LCK_RW_INTERLOCK) {
1909 			atomic_exchange_abort();
1910 			lck_rw_interlock_spin(lock);
1911 			continue;
1912 		}
1913 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1914 			atomic_exchange_abort();
1915 			return FALSE;             /* lock is busy */
1916 		}
1917 #ifdef DEBUG_RW
1918 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1919 			/*
1920 			 * If the lock is uncontended,
1921 			 * we do not need to check if we can lock it
1922 			 */
1923 			check_canlock = FALSE;
1924 		}
1925 #endif
1926 		data += LCK_RW_SHARED_READER;     /* Increment reader refcount */
1927 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1928 			break;
1929 		}
1930 		cpu_pause();
1931 	}
1932 #ifdef DEBUG_RW
1933 	if (check_canlock) {
1934 		/*
1935 		 * Best effort attempt to check that this thread
1936 		 * is not already holding the lock (this checks read mode too).
1937 		 */
1938 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1939 	}
1940 #endif
1941 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1942 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1943 
1944 	if (lock->lck_rw_can_sleep) {
1945 		lck_rw_lock_count_inc(thread, lock);
1946 	} else if (get_preemption_level() == 0) {
1947 		panic("Taking non-sleepable RW lock with preemption enabled");
1948 	}
1949 
1950 #if     CONFIG_DTRACE
1951 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1952 #endif  /* CONFIG_DTRACE */
1953 
1954 #ifdef DEBUG_RW
1955 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1956 #endif /* DEBUG_RW */
1957 	return TRUE;
1958 }
1959 
1960 __attribute__((noinline))
1961 static boolean_t
1962 lck_rw_try_lock_shared_internal(
1963 	lck_rw_t        *lock,
1964 	void            *caller)
1965 {
1966 	return lck_rw_try_lock_shared_internal_inline(lock, caller);
1967 }
1968 
1969 /*!
1970  * @function lck_rw_try_lock_shared
1971  *
1972  * @abstract
1973  * Tries to locks a rw_lock in read mode.
1974  * Tries to lock a rw_lock in read mode.
1975  * @discussion
1976  * This function will return and not block in case the lock cannot be acquired immediately.
1977  * See lck_rw_lock_shared for more details.
1978  *
1979  * @param lock           rw_lock to lock.
1980  *
1981  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
1982  * @returns TRUE if the lock is successfully acquired, FALSE if it is held exclusively or an upgrade is pending.
1983 boolean_t
1984 lck_rw_try_lock_shared(
1985 	lck_rw_t        *lock)
1986 {
1987 	return lck_rw_try_lock_shared_internal_inline(lock, __builtin_return_address(0));
1988 }
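/*
 * Editor's note: a hedged sketch (not part of the original source) of an
 * opportunistic read-side path that falls back instead of blocking.
 * 'g_cache_lock' is hypothetical.
 *
 *	if (lck_rw_try_lock_shared(&g_cache_lock)) {
 *		// ... fast-path lookup ...
 *		lck_rw_unlock_shared(&g_cache_lock);
 *	} else {
 *		// a writer or upgrader is active; take the slow path instead
 *	}
 */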
1989 
1990 __attribute__((always_inline))
1991 static boolean_t
1992 lck_rw_try_lock_exclusive_internal_inline(
1993 	lck_rw_t        *lock,
1994 	void            *caller)
1995 {
1996 #pragma unused(caller)
1997 	uint32_t        data, prev;
1998 
1999 	for (;;) {
2000 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
2001 		if (data & LCK_RW_INTERLOCK) {
2002 			atomic_exchange_abort();
2003 			lck_rw_interlock_spin(lock);
2004 			continue;
2005 		}
2006 		if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
2007 			atomic_exchange_abort();
2008 			return FALSE;
2009 		}
2010 		data |= LCK_RW_WANT_EXCL;
2011 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
2012 			break;
2013 		}
2014 		cpu_pause();
2015 	}
2016 	thread_t thread = current_thread();
2017 
2018 	if (lock->lck_rw_can_sleep) {
2019 		lck_rw_lock_count_inc(thread, lock);
2020 	} else if (get_preemption_level() == 0) {
2021 		panic("Taking non-sleepable RW lock with preemption enabled");
2022 	}
2023 
2024 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
2025 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
2026 
2027 	ordered_store_rw_owner(lock, thread->ctid);
2028 #if     CONFIG_DTRACE
2029 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
2030 #endif  /* CONFIG_DTRACE */
2031 
2032 #ifdef DEBUG_RW
2033 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
2034 #endif /* DEBUG_RW */
2035 	return TRUE;
2036 }
2037 
2038 __attribute__((noinline))
2039 static boolean_t
2040 lck_rw_try_lock_exclusive_internal(
2041 	lck_rw_t        *lock,
2042 	void            *caller)
2043 {
2044 	return lck_rw_try_lock_exclusive_internal_inline(lock, caller);
2045 }
2046 
2047 /*!
2048  * @function lck_rw_try_lock_exclusive
2049  *
2050  * @abstract
2051  * Tries to lock a rw_lock in write mode.
2052  *
2053  * @discussion
2054  * This function will return and not block in case the lock is already held.
2055  * See lck_rw_lock_exclusive for more details.
2056  *
2057  * @param lock           rw_lock to lock.
2058  *
2059  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
2060  */
2061 boolean_t
2062 lck_rw_try_lock_exclusive(
2063 	lck_rw_t        *lock)
2064 {
2065 	return lck_rw_try_lock_exclusive_internal_inline(lock, __builtin_return_address(0));
2066 }
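/*
 * Editor's note: a minimal sketch (not part of the original source) of an
 * opportunistic write attempt that defers the work when the lock is busy.
 * 'g_cache_lock' is hypothetical.
 *
 *	if (lck_rw_try_lock_exclusive(&g_cache_lock)) {
 *		// ... update the protected state ...
 *		lck_rw_unlock_exclusive(&g_cache_lock);
 *	} else {
 *		// held shared or exclusive by someone else; retry later
 *	}
 */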
2067 
2068 /*
2069  * Very sad hack, but the codegen for lck_rw_try_lock
2070  * is very unhappy with the combination of __builtin_return_address()
2071  * and a noreturn function. For some reason it adds more frames
2072  * than it should. rdar://76570684
2073  */
2074 boolean_t
2075 _lck_rw_try_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
2076 #pragma clang diagnostic push
2077 #pragma clang diagnostic ignored "-Wmissing-noreturn"
2078 __attribute__((noinline, weak))
2079 boolean_t
2080 _lck_rw_try_lock_type_panic(
2081 	lck_rw_t        *lck,
2082 	lck_rw_type_t   lck_rw_type)
2083 {
2084 	panic("lck_rw_try_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
2085 }
2086 #pragma clang diagnostic pop
2087 
2088 /*!
2089  * @function lck_rw_try_lock
2090  *
2091  * @abstract
2092  * Tries to lock a rw_lock with the specified type.
2093  *
2094  * @discussion
2095  * This function will return and not wait/block in case the lock is already held.
2096  * See lck_rw_try_lock_shared() or lck_rw_try_lock_exclusive() for more details.
2097  *
2098  * @param lck           rw_lock to lock.
2099  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2100  *
2101  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
2102  */
2103 boolean_t
2104 lck_rw_try_lock(
2105 	lck_rw_t        *lck,
2106 	lck_rw_type_t   lck_rw_type)
2107 {
2108 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2109 		return lck_rw_try_lock_shared_internal(lck, __builtin_return_address(0));
2110 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2111 		return lck_rw_try_lock_exclusive_internal(lck, __builtin_return_address(0));
2112 	}
2113 	return _lck_rw_try_lock_type_panic(lck, lck_rw_type);
2114 }
2115 
2116 /*
2117  *      Routine:        lck_rw_done_gen
2118  *
2119  *	prior_lock_state is the value in the 1st
2120  *      word of the lock at the time of a successful
2121  *	atomic compare and exchange with the new value...
2122  *      it represents the state of the lock before we
2123  *	decremented the rw_shared_count or cleared either
2124  *      rw_want_upgrade or rw_want_write and
2125  *	the lck_x_waiting bits...  since the wrapper
2126  *      routine has already changed the state atomically,
2127  *	we just need to decide if we should
2128  *	wake up anyone and what value to return... we do
2129  *	this by examining the state of the lock before
2130  *	we changed it
2131  */
2132 static lck_rw_type_t
2133 lck_rw_done_gen(
2134 	lck_rw_t        *lck,
2135 	uint32_t        prior_lock_state)
2136 {
2137 	lck_rw_word_t   fake_lck;
2138 	lck_rw_type_t   lock_type;
2139 	thread_t        thread;
2140 
2141 	/*
2142 	 * prior_lock_state is a snapshot of the 1st word of the
2143 	 * lock in question... we'll fake up a local copy of it
2144 	 * and carefully not access anything beyond what's defined
2145 	 * in the first word of a lck_rw_t
2146 	 */
2147 	fake_lck.data = prior_lock_state;
2148 
2149 	if (fake_lck.shared_count <= 1) {
2150 		if (fake_lck.w_waiting) {
2151 			thread_wakeup(LCK_RW_WRITER_EVENT(lck));
2152 		}
2153 
2154 		if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
2155 			thread_wakeup(LCK_RW_READER_EVENT(lck));
2156 		}
2157 	}
2158 	if (fake_lck.shared_count) {
2159 		lock_type = LCK_RW_TYPE_SHARED;
2160 	} else {
2161 		lock_type = LCK_RW_TYPE_EXCLUSIVE;
2162 	}
2163 
2164 	/* Check if dropping the lock means that we need to unpromote */
2165 	thread = current_thread();
2166 	if (fake_lck.can_sleep) {
2167 		lck_rw_lock_count_dec(thread, lck);
2168 	}
2169 
2170 #if CONFIG_DTRACE
2171 	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
2172 #endif
2173 
2174 #ifdef DEBUG_RW
2175 	remove_held_rwlock(lck, thread, lock_type);
2176 #endif /* DEBUG_RW */
2177 	return lock_type;
2178 }
2179 
2180 /*!
2181  * @function lck_rw_done
2182  *
2183  * @abstract
2184  * Force unlocks a rw_lock without consistency checks.
2185  *
2186  * @discussion
2187  * Do not use unless you are sure you can avoid the consistency checks.
2188  *
2189  * @param lock           rw_lock to unlock.
2190  */
2191 lck_rw_type_t
2192 lck_rw_done(
2193 	lck_rw_t        *lock)
2194 {
2195 	uint32_t        data, prev;
2196 	boolean_t       once = FALSE;
2197 
2198 #ifdef DEBUG_RW
2199 	/*
2200 	 * Best effort attempt to check that this thread
2201 	 * is holding the lock.
2202 	 */
2203 	thread_t thread = current_thread();
2204 	assert_held_rwlock(lock, thread, 0);
2205 #endif /* DEBUG_RW */
2206 	for (;;) {
2207 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
2208 		if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
2209 			atomic_exchange_abort();
2210 			lck_rw_interlock_spin(lock);
2211 			continue;
2212 		}
2213 		if (data & LCK_RW_SHARED_MASK) {        /* lock is held shared */
2214 			assertf(lock->lck_rw_owner == 0,
2215 			    "state=0x%x, owner=%p", lock->lck_rw_data,
2216 			    ctid_get_thread_unsafe(lock->lck_rw_owner));
2217 			data -= LCK_RW_SHARED_READER;
2218 			if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
2219 				goto check_waiters;
2220 			}
2221 		} else {                                        /* if reader count == 0, must be exclusive lock */
2222 			if (data & LCK_RW_WANT_UPGRADE) {
2223 				data &= ~(LCK_RW_WANT_UPGRADE);
2224 			} else {
2225 				if (data & LCK_RW_WANT_EXCL) {
2226 					data &= ~(LCK_RW_WANT_EXCL);
2227 				} else {                                /* lock is not 'owned', panic */
2228 					panic("Releasing non-exclusive RW lock without a reader refcount!");
2229 				}
2230 			}
2231 			if (!once) {
2232 				// Only check for holder and clear it once
2233 				assertf(lock->lck_rw_owner == current_thread()->ctid,
2234 				    "state=0x%x, owner=%p", lock->lck_rw_data,
2235 				    ctid_get_thread_unsafe(lock->lck_rw_owner));
2236 				ordered_store_rw_owner(lock, 0);
2237 				once = TRUE;
2238 			}
2239 check_waiters:
2240 			/*
2241 			 * test the original values to match what
2242 			 * lck_rw_done_gen is going to do to determine
2243 			 * which wakeups need to happen...
2244 			 *
2245 			 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
2246 			 */
2247 			if (prev & LCK_RW_W_WAITING) {
2248 				data &= ~(LCK_RW_W_WAITING);
2249 				if ((prev & LCK_RW_PRIV_EXCL) == 0) {
2250 					data &= ~(LCK_RW_R_WAITING);
2251 				}
2252 			} else {
2253 				data &= ~(LCK_RW_R_WAITING);
2254 			}
2255 		}
2256 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
2257 			break;
2258 		}
2259 		cpu_pause();
2260 	}
2261 	return lck_rw_done_gen(lock, prev);
2262 }
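/*
 * Editor's note: a hedged sketch (not part of the original source).
 * lck_rw_done() is useful when the caller no longer knows whether it holds
 * the lock shared or exclusive (e.g. after a conditional upgrade); the
 * return value reports which mode was released. 'g_map_lock' is hypothetical.
 *
 *	lck_rw_type_t held = lck_rw_done(&g_map_lock);
 *	// held is LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
 */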
2263 
2264 /*!
2265  * @function lck_rw_unlock_shared
2266  *
2267  * @abstract
2268  * Unlocks a rw_lock previously locked in shared mode.
2269  *
2270  * @discussion
2271  * The same thread that locked the lock needs to unlock it.
2272  *
2273  * @param lck           rw_lock held in shared mode to unlock.
2274  */
2275 void
2276 lck_rw_unlock_shared(
2277 	lck_rw_t        *lck)
2278 {
2279 	lck_rw_type_t   ret;
2280 
2281 	assertf(lck->lck_rw_owner == 0,
2282 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2283 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2284 	assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
2285 	ret = lck_rw_done(lck);
2286 
2287 	if (ret != LCK_RW_TYPE_SHARED) {
2288 		panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
2289 	}
2290 }
2291 
2292 /*!
2293  * @function lck_rw_unlock_exclusive
2294  *
2295  * @abstract
2296  * Unlocks a rw_lock previously locked in exclusive mode.
2297  *
2298  * @discussion
2299  * The same thread that locked the lock needs to unlock it.
2300  *
2301  * @param lck           rw_lock held in exclusive mode to unlock.
2302  */
2303 void
2304 lck_rw_unlock_exclusive(
2305 	lck_rw_t        *lck)
2306 {
2307 	lck_rw_type_t   ret;
2308 
2309 	assertf(lck->lck_rw_owner == current_thread()->ctid,
2310 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2311 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2312 	ret = lck_rw_done(lck);
2313 
2314 	if (ret != LCK_RW_TYPE_EXCLUSIVE) {
2315 		panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
2316 	}
2317 }
2318 
2319 /*!
2320  * @function lck_rw_unlock
2321  *
2322  * @abstract
2323  * Unlocks a rw_lock previously locked with lck_rw_type.
2324  *
2325  * @discussion
2326  * The lock must be unlocked by the same thread it was locked from.
2327  * The type of the lock/unlock have to match, unless an upgrade/downgrade was performed while
2328  * holding the lock.
2329  *
2330  * @param lck           rw_lock to unlock.
2331  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2332  */
2333 void
2334 lck_rw_unlock(
2335 	lck_rw_t         *lck,
2336 	lck_rw_type_t    lck_rw_type)
2337 {
2338 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2339 		lck_rw_unlock_shared(lck);
2340 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2341 		lck_rw_unlock_exclusive(lck);
2342 	} else {
2343 		panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
2344 	}
2345 }
2346 
2347 /*!
2348  * @function lck_rw_assert
2349  *
2350  * @abstract
2351  * Asserts the rw_lock is held.
2352  *
2353  * @discussion
2354  * read-write locks do not have a concept of ownership when held in shared mode,
2355  * so this function merely asserts that someone is holding the lock, not necessarily the caller.
2356  * However if rw_lock_debug is on, a best effort mechanism to track the owners is in place, and
2357  * this function can be more accurate.
2358  * Type can be LCK_RW_ASSERT_SHARED, LCK_RW_ASSERT_EXCLUSIVE, LCK_RW_ASSERT_HELD or
2359  * LCK_RW_ASSERT_NOTHELD.
2360  *
2361  * @param lck   rw_lock to check.
2362  * @param type  assert type
2363  */
2364 void
2365 lck_rw_assert(
2366 	lck_rw_t        *lck,
2367 	unsigned int    type)
2368 {
2369 	thread_t thread = current_thread();
2370 
2371 	switch (type) {
2372 	case LCK_RW_ASSERT_SHARED:
2373 		if ((lck->lck_rw_shared_count != 0) &&
2374 		    (lck->lck_rw_owner == 0)) {
2375 #if DEBUG_RW
2376 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2377 #endif /* DEBUG_RW */
2378 			return;
2379 		}
2380 		break;
2381 	case LCK_RW_ASSERT_EXCLUSIVE:
2382 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2383 		    (lck->lck_rw_shared_count == 0) &&
2384 		    (lck->lck_rw_owner == thread->ctid)) {
2385 #if DEBUG_RW
2386 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2387 #endif /* DEBUG_RW */
2388 			return;
2389 		}
2390 		break;
2391 	case LCK_RW_ASSERT_HELD:
2392 		if (lck->lck_rw_shared_count != 0) {
2393 #if DEBUG_RW
2394 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2395 #endif /* DEBUG_RW */
2396 			return;         // Held shared
2397 		}
2398 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2399 		    (lck->lck_rw_owner == thread->ctid)) {
2400 #if DEBUG_RW
2401 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2402 #endif /* DEBUG_RW */
2403 			return;         // Held exclusive
2404 		}
2405 		break;
2406 	case LCK_RW_ASSERT_NOTHELD:
2407 		if ((lck->lck_rw_shared_count == 0) &&
2408 		    !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2409 		    (lck->lck_rw_owner == 0)) {
2410 #ifdef DEBUG_RW
2411 			assert_canlock_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2412 #endif /* DEBUG_RW */
2413 			return;
2414 		}
2415 		break;
2416 	default:
2417 		break;
2418 	}
2419 	panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2420 }
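/*
 * Editor's note: a minimal sketch (not part of the original source) of a
 * helper that documents and enforces its locking contract with
 * lck_rw_assert(). 'g_map_lock' is hypothetical.
 *
 *	static void
 *	map_update_locked(void)
 *	{
 *		lck_rw_assert(&g_map_lock, LCK_RW_ASSERT_EXCLUSIVE);
 *		// ... safe to modify the state protected by g_map_lock ...
 *	}
 */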
2421 
2422 /*!
2423  * @function kdp_lck_rw_lock_is_acquired_exclusive
2424  *
2425  * @abstract
2426  * Checks if a rw_lock is held exclusively.
2427  *
2428  * @discussion
2429  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2430  *
2431  * @param lck   lock to check
2432  *
2433  * @returns TRUE if the lock is held exclusively
2434  */
2435 boolean_t
2436 kdp_lck_rw_lock_is_acquired_exclusive(
2437 	lck_rw_t        *lck)
2438 {
2439 	if (not_in_kdp) {
2440 		panic("rw lock exclusive check done outside of kernel debugger");
2441 	}
2442 	return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2443 }
2444 
2445 void
2446 kdp_rwlck_find_owner(
2447 	__unused struct waitq   *waitq,
2448 	event64_t               event,
2449 	thread_waitinfo_t       *waitinfo)
2450 {
2451 	lck_rw_t        *rwlck = NULL;
2452 	switch (waitinfo->wait_type) {
2453 	case kThreadWaitKernelRWLockRead:
2454 		rwlck = READ_EVENT_TO_RWLOCK(event);
2455 		break;
2456 	case kThreadWaitKernelRWLockWrite:
2457 	case kThreadWaitKernelRWLockUpgrade:
2458 		rwlck = WRITE_EVENT_TO_RWLOCK(event);
2459 		break;
2460 	default:
2461 		panic("%s was called with an invalid blocking type", __FUNCTION__);
2462 		break;
2463 	}
2464 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2465 	waitinfo->owner = thread_tid(ctid_get_thread(rwlck->lck_rw_owner));
2466 }
2467 
2468 /*!
2469  * @function lck_rw_lock_yield_shared
2470  *
2471  * @abstract
2472  * Yields a rw_lock held in shared mode.
2473  *
2474  * @discussion
2475  * This function can block.
2476  * Yields the lock in case there are writers waiting.
2477  * The yield will unlock, block, and re-lock the lock in shared mode.
2478  *
2479  * @param lck           rw_lock already held in shared mode to yield.
2480  * @param force_yield   if set to true it will always yield irrespective of the lock status
2481  *
2482  * @returns TRUE if the lock was yielded, FALSE otherwise
2483  */
2484 bool
2485 lck_rw_lock_yield_shared(
2486 	lck_rw_t        *lck,
2487 	boolean_t       force_yield)
2488 {
2489 	lck_rw_word_t   word;
2490 
2491 	lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2492 
2493 	word.data = ordered_load_rw(lck);
2494 	if (word.want_excl || word.want_upgrade || force_yield) {
2495 		lck_rw_unlock_shared(lck);
2496 		mutex_pause(2);
2497 		lck_rw_lock_shared(lck);
2498 		return true;
2499 	}
2500 
2501 	return false;
2502 }
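/*
 * Editor's note: a hedged sketch (not part of the original source) of
 * yielding periodically from a long read-side scan so queued writers are not
 * starved. 'g_list_lock', 'list_head' and 'cursor' are hypothetical.
 *
 *	lck_rw_lock_shared(&g_list_lock);
 *	while (cursor != NULL) {
 *		// ... examine *cursor ...
 *		if (lck_rw_lock_yield_shared(&g_list_lock, FALSE)) {
 *			// the lock was dropped and re-taken; cursor may be stale
 *			cursor = list_head;
 *			continue;
 *		}
 *		cursor = cursor->next;
 *	}
 *	lck_rw_unlock_shared(&g_list_lock);
 */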
2503 
2504 /*!
2505  * @function lck_rw_lock_yield_exclusive
2506  *
2507  * @abstract
2508  * Yields a rw_lock held in exclusive mode.
2509  *
2510  * @discussion
2511  * This function can block.
2512  * Yields the lock in case there are waiters, depending on the specified mode.
2513  * The yield will unlock, block, and re-lock the lock in exclusive mode.
2514  *
2515  * @param lck           rw_lock already held in exclusive mode to yield.
2516  * @param mode          when to yield.
2517  *
2518  * @returns TRUE if the lock was yielded, FALSE otherwise
2519  */
2520 bool
2521 lck_rw_lock_yield_exclusive(
2522 	lck_rw_t        *lck,
2523 	lck_rw_yield_t  mode)
2524 {
2525 	lck_rw_word_t word;
2526 	bool yield = false;
2527 
2528 	lck_rw_assert(lck, LCK_RW_ASSERT_EXCLUSIVE);
2529 
2530 	if (mode == LCK_RW_YIELD_ALWAYS) {
2531 		yield = true;
2532 	} else {
2533 		word.data = ordered_load_rw(lck);
2534 		if (word.w_waiting) {
2535 			yield = true;
2536 		} else if (mode == LCK_RW_YIELD_ANY_WAITER) {
2537 			yield = (word.r_waiting != 0);
2538 		}
2539 	}
2540 
2541 	if (yield) {
2542 		lck_rw_unlock_exclusive(lck);
2543 		mutex_pause(2);
2544 		lck_rw_lock_exclusive(lck);
2545 	}
2546 
2547 	return yield;
2548 }
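/*
 * Editor's note: a hedged sketch (not part of the original source) of a long
 * exclusive operation periodically letting waiters in; the mode selects
 * whether any waiter or only writers trigger the yield. 'g_list_lock' and
 * 'more_work' are hypothetical.
 *
 *	lck_rw_lock_exclusive(&g_list_lock);
 *	while (more_work) {
 *		// ... do a bounded chunk of work ...
 *		if (lck_rw_lock_yield_exclusive(&g_list_lock, LCK_RW_YIELD_ANY_WAITER)) {
 *			// the lock was dropped and re-taken; re-validate as needed
 *		}
 *	}
 *	lck_rw_unlock_exclusive(&g_list_lock);
 */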
2549 
2550 /*!
2551  * @function lck_rw_sleep
2552  *
2553  * @abstract
2554  * Assert_wait on an event while holding the rw_lock.
2555  *
2556  * @discussion
2557  * The flags decide how to re-acquire the lock upon wake up
2558  * (LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2559  * and whether the priority needs to be kept boosted until the lock is
2560  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2561  *
2562  * @param lck                   rw_lock to use to synch the assert_wait.
2563  * @param lck_sleep_action      flags.
2564  * @param event                 event to assert_wait on.
2565  * @param interruptible         wait type.
2566  */
2567 wait_result_t
2568 lck_rw_sleep(
2569 	lck_rw_t                *lck,
2570 	lck_sleep_action_t      lck_sleep_action,
2571 	event_t                 event,
2572 	wait_interrupt_t        interruptible)
2573 {
2574 	wait_result_t           res;
2575 	lck_rw_type_t           lck_rw_type;
2576 	thread_pri_floor_t      token;
2577 
2578 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2579 		panic("Invalid lock sleep action %x", lck_sleep_action);
2580 	}
2581 
2582 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2583 		/*
2584 		 * Although we are dropping the RW lock, the intent in most cases
2585 		 * is that this thread remains as an observer, since it may hold
2586 		 * some secondary resource, but must yield to avoid deadlock. In
2587 		 * this situation, make sure that the thread is boosted to the
2588 		 * ceiling while blocked, so that it can re-acquire the
2589 		 * RW lock at that priority.
2590 		 */
2591 		token = thread_priority_floor_start();
2592 	}
2593 
2594 	res = assert_wait(event, interruptible);
2595 	if (res == THREAD_WAITING) {
2596 		lck_rw_type = lck_rw_done(lck);
2597 		res = thread_block(THREAD_CONTINUE_NULL);
2598 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2599 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2600 				lck_rw_lock(lck, lck_rw_type);
2601 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2602 				lck_rw_lock_exclusive(lck);
2603 			} else {
2604 				lck_rw_lock_shared(lck);
2605 			}
2606 		}
2607 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2608 		(void)lck_rw_done(lck);
2609 	}
2610 
2611 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2612 		thread_priority_floor_end(&token);
2613 	}
2614 
2615 	return res;
2616 }
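/*
 * Editor's note: a hedged sketch (not part of the original source) of a
 * wait-for-condition loop built on lck_rw_sleep(); the lock is dropped while
 * blocked and re-taken in exclusive mode before the condition is re-checked.
 * 'g_state_lock' and 'g_ready' are hypothetical; the waker would call
 * thread_wakeup((event_t)&g_ready) with the lock held.
 *
 *	lck_rw_lock_exclusive(&g_state_lock);
 *	while (!g_ready) {
 *		(void) lck_rw_sleep(&g_state_lock, LCK_SLEEP_EXCLUSIVE,
 *		    (event_t)&g_ready, THREAD_UNINT);
 *	}
 *	// ... g_ready is true and the lock is held exclusive ...
 *	lck_rw_unlock_exclusive(&g_state_lock);
 */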
2617 
2618 /*!
2619  * @function lck_rw_sleep_deadline
2620  *
2621  * @abstract
2622  * Assert_wait_deadline on an event while holding the rw_lock.
2623  *
2624  * @discussion
2625  * The flags decide how to re-acquire the lock upon wake up
2626  * (LCK_SLEEP_SHARED, LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2627  * and whether the priority needs to be kept boosted until the lock is
2628  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2629  *
2630  * @param lck                   rw_lock to use to synch the assert_wait.
2631  * @param lck_sleep_action      flags.
2632  * @param event                 event to assert_wait on.
2633  * @param interruptible         wait type.
2634  * @param deadline              time by which the thread will be woken up if the event has not occurred
2635  */
2636 wait_result_t
2637 lck_rw_sleep_deadline(
2638 	lck_rw_t                *lck,
2639 	lck_sleep_action_t      lck_sleep_action,
2640 	event_t                 event,
2641 	wait_interrupt_t        interruptible,
2642 	uint64_t                deadline)
2643 {
2644 	wait_result_t           res;
2645 	lck_rw_type_t           lck_rw_type;
2646 	thread_pri_floor_t      token;
2647 
2648 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2649 		panic("Invalid lock sleep action %x", lck_sleep_action);
2650 	}
2651 
2652 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2653 		token = thread_priority_floor_start();
2654 	}
2655 
2656 	res = assert_wait_deadline(event, interruptible, deadline);
2657 	if (res == THREAD_WAITING) {
2658 		lck_rw_type = lck_rw_done(lck);
2659 		res = thread_block(THREAD_CONTINUE_NULL);
2660 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2661 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2662 				lck_rw_lock(lck, lck_rw_type);
2663 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2664 				lck_rw_lock_exclusive(lck);
2665 			} else {
2666 				lck_rw_lock_shared(lck);
2667 			}
2668 		}
2669 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2670 		(void)lck_rw_done(lck);
2671 	}
2672 
2673 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2674 		thread_priority_floor_end(&token);
2675 	}
2676 
2677 	return res;
2678 }
2679 
2680 /*
2681  * Reader-writer lock promotion
2682  *
2683  * We support a limited form of reader-writer
2684  * lock promotion whose effects are:
2685  *
2686  *   * Qualifying threads have decay disabled
2687  *   * Scheduler priority is reset to a floor of
2688  *     their statically assigned priority
2689  *     or MINPRI_RWLOCK
2690  *
2691  * The rationale is that lck_rw_ts do not have
2692  * a single owner, so we cannot apply a directed
2693  * priority boost from all waiting threads
2694  * to all holding threads without maintaining
2695  * lists of all shared owners and all waiting
2696  * threads for every lock.
2697  *
2698  * Instead (and to preserve the uncontended fast-
2699  * path), acquiring (or attempting to acquire)
2700  * a RW lock in shared or exclusive mode increments
2701  * a per-thread counter. Only if that thread stops
2702  * making forward progress (for instance blocking
2703  * on a mutex, or being preempted) do we consult
2704  * the counter and apply the priority floor.
2705  * When the thread becomes runnable again (or in
2706  * the case of preemption it never stopped being
2707  * runnable), it has the priority boost and should
2708  * be in a good position to run on the CPU and
2709  * release all RW locks (at which point the priority
2710  * boost is cleared).
2711  *
2712  * Care must be taken to ensure that priority
2713  * boosts are not retained indefinitely, since unlike
2714  * mutex priority boosts (where the boost is tied
2715  * to the mutex lifecycle), the boost is tied
2716  * to the thread and independent of any particular
2717  * lck_rw_t. Assertions are in place on return
2718  * to userspace so that the boost is not held
2719  * indefinitely.
2720  *
2721  * The routines that increment/decrement the
2722  * per-thread counter should err on the side of
2723  * incrementing any time a preemption is possible
2724  * and the lock would be visible to the rest of the
2725  * system as held (so it should be incremented before
2726  * interlocks are dropped/preemption is enabled, or
2727  * before a CAS is executed to acquire the lock).
2728  *
2729  */
2730 
2731 /*!
2732  * @function lck_rw_clear_promotion
2733  *
2734  * @abstract
2735  * Undo priority promotions when the last rw_lock
2736  * is released by a thread (if a promotion was active).
2737  *
2738  * @param thread        thread to demote.
2739  * @param lock          object that is the reason for the demotion (used for kdebug tracing).
2740  */
2741 __attribute__((noinline))
2742 static void
2743 lck_rw_clear_promotion(thread_t thread, const void *lock)
2744 {
2745 	/* Cancel any promotions if the thread had actually blocked while holding a RW lock */
2746 	spl_t s = splsched();
2747 	thread_lock(thread);
2748 
2749 	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
2750 		sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED,
2751 		    unslide_for_kdebug(lock));
2752 	}
2753 
2754 	thread_unlock(thread);
2755 	splx(s);
2756 }
2757 
2758 /*!
2759  * @function lck_rw_set_promotion_locked
2760  *
2761  * @abstract
2762  * Callout from context switch if the thread goes
2763  * off core with a positive rwlock_count.
2764  *
2765  * @discussion
2766  * Called at splsched with the thread locked.
2767  *
2768  * @param thread        thread to promote.
2769  */
2770 __attribute__((always_inline))
2771 void
2772 lck_rw_set_promotion_locked(thread_t thread)
2773 {
2774 	if (LcksOpts & LCK_OPTION_DISABLE_RW_PRIO) {
2775 		return;
2776 	}
2777 
2778 	assert(thread->rwlock_count > 0);
2779 
2780 	if (!(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2781 		sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, 0);
2782 	}
2783 }
2784 
2785 __attribute__((always_inline))
2786 void
2787 lck_rw_lock_count_inc(thread_t thread, const void *lock __unused)
2788 {
2789 	if (thread->rwlock_count++ == 0) {
2790 #if MACH_ASSERT
2791 		/*
2792 		 * Set the ast to check that the
2793 		 * rwlock_count is going to be set to zero when
2794 		 * going back to userspace.
2795 		 * Set it only once when we increment it for the first time.
2796 		 */
2797 		act_set_debug_assert();
2798 #endif
2799 	}
2800 }
2801 
2802 __abortlike
2803 static void
2804 __lck_rw_lock_count_dec_panic(thread_t thread)
2805 {
2806 	panic("rw lock count underflow for thread %p", thread);
2807 }
2808 
2809 __attribute__((always_inline))
2810 void
2811 lck_rw_lock_count_dec(thread_t thread, const void *lock)
2812 {
2813 	uint32_t rwlock_count = thread->rwlock_count--;
2814 
2815 	if (rwlock_count == 0) {
2816 		__lck_rw_lock_count_dec_panic(thread);
2817 	}
2818 
2819 	if (__probable(rwlock_count == 1)) {
2820 		/* sched_flags checked without lock, but will be rechecked while clearing */
2821 		if (__improbable(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2822 			lck_rw_clear_promotion(thread, lock);
2823 		}
2824 	}
2825 }
2826