/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
#define LOCK_PRIVATE 1
#include <debug.h>
#include <kern/locks_internal.h>
#include <kern/lock_stat.h>
#include <kern/locks.h>
#include <kern/zalloc.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>
#include <machine/atomic.h>
#include <machine/machine_cpu.h>

KALLOC_TYPE_DEFINE(KT_LCK_RW, lck_rw_t, KT_PRIV_ACCT);

#define LCK_RW_WRITER_EVENT(lck)                (event_t)((uintptr_t)(lck)+1)
#define LCK_RW_READER_EVENT(lck)                (event_t)((uintptr_t)(lck)+2)
#define WRITE_EVENT_TO_RWLOCK(event)            ((lck_rw_t *)((uintptr_t)(event)-1))
#define READ_EVENT_TO_RWLOCK(event)             ((lck_rw_t *)((uintptr_t)(event)-2))

#if CONFIG_DTRACE
#define DTRACE_RW_SHARED        0x0     //reader
#define DTRACE_RW_EXCL          0x1     //writer
#define DTRACE_NO_FLAG          0x0     //not applicable
#endif  /* CONFIG_DTRACE */

#define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
#define LCK_RW_LCK_SHARED_CODE          0x102
#define LCK_RW_LCK_SH_TO_EX_CODE        0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
#define LCK_RW_LCK_EX_TO_SH_CODE        0x105

#if __x86_64__
#define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
#endif

#define lck_rw_ilk_lock(lock)   hw_lock_bit  ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
#define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)

#define ordered_load_rw(lock)                   os_atomic_load(&(lock)->lck_rw_data, compiler_acq_rel)
#define ordered_store_rw(lock, value)           os_atomic_store(&(lock)->lck_rw_data, (value), compiler_acq_rel)
#define ordered_store_rw_owner(lock, value)     os_atomic_store(&(lock)->lck_rw_owner, (value), compiler_acq_rel)

#ifdef DEBUG_RW
static TUNABLE(bool, lck_rw_recursive_shared_assert_74048094, "lck_rw_recursive_shared_assert", false);
SECURITY_READ_ONLY_EARLY(vm_packing_params_t) rwlde_caller_packing_params =
    VM_PACKING_PARAMS(LCK_RW_CALLER_PACKED);
#define rw_lock_debug_disabled()                ((LcksOpts & disLkRWDebug) == disLkRWDebug)

#define set_rwlde_caller_packed(entry, caller)          ((entry)->rwlde_caller_packed = VM_PACK_POINTER((vm_offset_t)caller, LCK_RW_CALLER_PACKED))
#define get_rwlde_caller(entry)                         ((void*)VM_UNPACK_POINTER(entry->rwlde_caller_packed, LCK_RW_CALLER_PACKED))

#endif /* DEBUG_RW */

/*!
 * @function lck_rw_alloc_init
 *
 * @abstract
 * Allocates and initializes a rw_lock_t.
 *
 * @discussion
 * The function can block. See lck_rw_init() for initialization details.
 *
 * @param grp           lock group to associate with the lock.
 * @param attr          lock attribute to initialize the lock.
 *
 * @returns             NULL or the allocated lock
 */
lck_rw_t *
lck_rw_alloc_init(
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	lck_rw_t *lck;

	lck = zalloc_flags(KT_LCK_RW, Z_WAITOK | Z_ZERO);
	lck_rw_init(lck, grp, attr);
	return lck;
}
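
/*
 * Usage sketch (illustrative only, not part of the kernel sources):
 * the allocate/use/free lifecycle. `my_grp` stands in for a lock group
 * owned by the caller.
 *
 *	lck_rw_t *rwl = lck_rw_alloc_init(my_grp, LCK_ATTR_NULL);
 *
 *	lck_rw_lock_exclusive(rwl);
 *	// ... exclusive critical section ...
 *	lck_rw_unlock_exclusive(rwl);
 *
 *	lck_rw_free(rwl, my_grp);	// the lock must not be held here
 */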

/*!
 * @function lck_rw_init
 *
 * @abstract
 * Initializes a rw_lock_t.
 *
 * @discussion
 * Usage statistics for the lock will be added to the lock group provided.
 *
 * The lock attribute can be used to specify the lock contention behaviour.
 * RW_WRITER_PRIORITY is the default behaviour (LCK_ATTR_NULL defaults to RW_WRITER_PRIORITY)
 * and lck_attr_rw_shared_priority() can be used to set the behaviour to RW_SHARED_PRIORITY.
 *
 * RW_WRITER_PRIORITY gives priority to the writers upon contention with the readers:
 * if the lock is held and a writer starts waiting for it, readers will not be able
 * to acquire the lock until all writers stop contending. Readers could
 * potentially starve.
 * RW_SHARED_PRIORITY gives priority to the readers upon contention with the writers:
 * unless the lock is held in exclusive mode, readers will always be able to acquire the lock.
 * Readers can lock a shared lock even if there are writers waiting. Writers could potentially
 * starve.
 *
 * @param lck           lock to initialize.
 * @param grp           lock group to associate with the lock.
 * @param attr          lock attribute to initialize the lock.
 *
 */
void
lck_rw_init(
	lck_rw_t        *lck,
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	/* keep this so that the lck_type_t type is referenced for lldb */
	lck_type_t type = LCK_TYPE_RW;

	if (attr == LCK_ATTR_NULL) {
		attr = &lck_attr_default;
	}
	*lck = (lck_rw_t){
		.lck_rw_type = type,
		.lck_rw_can_sleep = true,
		.lck_rw_priv_excl = !(attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY),
	};
	lck_grp_reference(grp, &grp->lck_grp_rwcnt);
}
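
/*
 * Usage sketch (illustrative only): initializing an embedded lock with
 * RW_SHARED_PRIORITY through the attribute API mentioned above. The group
 * name "my-subsystem" is an arbitrary example.
 *
 *	lck_grp_t *grp = lck_grp_alloc_init("my-subsystem", LCK_GRP_ATTR_NULL);
 *	lck_attr_t *attr = lck_attr_alloc_init();
 *	lck_attr_rw_shared_priority(attr);	// readers won't queue behind waiting writers
 *
 *	lck_rw_t rwl;
 *	lck_rw_init(&rwl, grp, attr);
 *	lck_attr_free(attr);			// the lock keeps no reference to attr
 */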

/*!
 * @function lck_rw_free
 *
 * @abstract
 * Frees a rw_lock previously allocated with lck_rw_alloc_init().
 *
 * @discussion
 * The lock must not be held by any thread.
 *
 * @param lck           rw_lock to free.
 */
void
lck_rw_free(
	lck_rw_t        *lck,
	lck_grp_t       *grp)
{
	lck_rw_destroy(lck, grp);
	zfree(KT_LCK_RW, lck);
}

/*!
 * @function lck_rw_destroy
 *
 * @abstract
 * Destroys a rw_lock previously initialized with lck_rw_init().
 *
 * @discussion
 * The lock must not be held by any thread.
 *
 * @param lck           rw_lock to destroy.
 */
void
lck_rw_destroy(
	lck_rw_t        *lck,
	lck_grp_t       *grp)
{
	if (lck->lck_rw_type != LCK_TYPE_RW ||
	    lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
		panic("Destroying previously destroyed lock %p", lck);
	}
	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);

	lck->lck_rw_type = LCK_TYPE_NONE;
	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
	lck_grp_deallocate(grp, &grp->lck_grp_rwcnt);
}

#ifdef DEBUG_RW

/*
 * Best effort mechanism to debug rw_locks.
 *
 * This mechanism is in addition to the owner checks. The owner is set
 * only when the lock is held in exclusive mode, so the checks do not cover
 * the cases in which the lock is held in shared mode.
 *
 * This mechanism tentatively stores the rw_lock acquired and its debug
 * information on the thread struct.
 * Only up to LCK_RW_EXPECTED_MAX_NUMBER rw_lock debug entries can be stored.
 *
 * NOTE: LCK_RW_EXPECTED_MAX_NUMBER is the expected number of rw_locks held
 * at the same time. If a thread holds more than this number of rw_locks we
 * will start losing debug information.
 * Increasing LCK_RW_EXPECTED_MAX_NUMBER will increase the probability we will
 * store the debug information, but it will require more memory per thread
 * and longer lock/unlock time.
 *
 * If an empty slot is found for the debug information, we record the lock;
 * otherwise we set the overflow threshold flag.
 *
 * Once we have reached the overflow threshold we might stop asserting,
 * because we can no longer be sure whether a lock was acquired or not.
 *
 * Even if we reached the overflow threshold, we try to store the debug information
 * for the new locks acquired. This can be useful in core dumps to debug
 * possible return to userspace without unlocking and to find possible readers
 * holding the lock.
 */
__startup_func
static void
rw_lock_init(void)
{
	if (kern_feature_override(KF_RW_LOCK_DEBUG_OVRD)) {
		LcksOpts |= disLkRWDebug;
	}
}
STARTUP(LOCKS, STARTUP_RANK_FIRST, rw_lock_init);

static inline struct rw_lock_debug_entry *
find_lock_in_savedlocks(lck_rw_t* lock, rw_lock_debug_t *rw_locks_held)
{
	int i;
	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
		struct rw_lock_debug_entry *existing = &rw_locks_held->rwld_locks[i];
		if (existing->rwlde_lock == lock) {
			return existing;
		}
	}

	return NULL;
}

__abortlike
static void
rwlock_slot_panic(rw_lock_debug_t *rw_locks_held)
{
	panic("No empty slot found in %p slot_used %d", rw_locks_held, rw_locks_held->rwld_locks_saved);
}

static inline struct rw_lock_debug_entry *
find_empty_slot(rw_lock_debug_t *rw_locks_held)
{
	int i;
	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
		struct rw_lock_debug_entry *entry = &rw_locks_held->rwld_locks[i];
		if (entry->rwlde_lock == NULL) {
			return entry;
		}
	}
	rwlock_slot_panic(rw_locks_held);
}

__abortlike
static void
canlock_rwlock_panic(lck_rw_t* lock, thread_t thread, struct rw_lock_debug_entry *entry)
{
	panic("RW lock %p already held by %p caller %p mode_count %d state 0x%x owner 0x%p ",
	    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
}

static inline void
assert_canlock_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;

	if (__probable(rw_lock_debug_disabled() || (rw_locks_held->rwld_locks_acquired == 0))) {
		//no locks saved, safe to lock
		return;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__improbable(entry != NULL)) {
		boolean_t can_be_shared_recursive;
		if (lck_rw_recursive_shared_assert_74048094) {
			can_be_shared_recursive = (lock->lck_rw_priv_excl == 0);
		} else {
			/* rw_lock_shared is currently called recursively;
			 * until that code is fixed, allow recursive locking
			 * in shared mode
			 */
			can_be_shared_recursive = TRUE;
		}
		if ((type == LCK_RW_TYPE_SHARED) && can_be_shared_recursive && entry->rwlde_mode_count >= 1) {
			return;
		}
		canlock_rwlock_panic(lock, thread, entry);
	}
}

__abortlike
static void
held_rwlock_notheld_panic(lck_rw_t* lock, thread_t thread)
{
	panic("RW lock %p not held by %p", lock, thread);
}

__abortlike
static void
held_rwlock_notheld_with_info_panic(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, struct rw_lock_debug_entry *entry)
{
	if (type == LCK_RW_TYPE_EXCLUSIVE) {
		panic("RW lock %p not held in exclusive by %p caller %p read %d state 0x%x owner 0x%p ",
		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
	} else {
		panic("RW lock %p not held in shared by %p caller %p read %d state 0x%x owner 0x%p ",
		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
	}
}

static inline void
assert_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;

	if (__probable(rw_lock_debug_disabled())) {
		return;
	}

	if (__improbable(rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_locks_saved == 0)) {
		if (rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_overflow == 0) {
			held_rwlock_notheld_panic(lock, thread);
		}
		return;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__probable(entry != NULL)) {
		if (type == LCK_RW_TYPE_EXCLUSIVE && entry->rwlde_mode_count != -1) {
			held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
		} else {
			if (type == LCK_RW_TYPE_SHARED && entry->rwlde_mode_count <= 0) {
				held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
			}
		}
	} else {
		if (rw_locks_held->rwld_overflow == 0) {
			held_rwlock_notheld_panic(lock, thread);
		}
	}
}

static inline void
change_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t typeFrom, void* caller)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;

	if (__probable(rw_lock_debug_disabled())) {
		return;
	}

	if (__improbable(rw_locks_held->rwld_locks_saved == 0)) {
		if (rw_locks_held->rwld_overflow == 0) {
			held_rwlock_notheld_panic(lock, thread);
		}
		return;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__probable(entry != NULL)) {
		if (typeFrom == LCK_RW_TYPE_SHARED) {
			//We are upgrading
			assertf(entry->rwlde_mode_count == 1,
			    "RW lock %p not held by a single shared when upgrading "
			    "by %p caller %p read %d state 0x%x owner 0x%p ",
			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
			entry->rwlde_mode_count = -1;
			set_rwlde_caller_packed(entry, caller);
		} else {
			//We are downgrading
			assertf(entry->rwlde_mode_count == -1,
			    "RW lock %p not held in write mode when downgrading "
			    "by %p caller %p read %d state 0x%x owner 0x%p ",
			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
			entry->rwlde_mode_count = 1;
			set_rwlde_caller_packed(entry, caller);
		}
		return;
	}

	if (rw_locks_held->rwld_overflow == 0) {
		held_rwlock_notheld_panic(lock, thread);
	}

	if (rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER) {
		//array is full
		return;
	}

	struct rw_lock_debug_entry *null_entry = find_empty_slot(rw_locks_held);
	null_entry->rwlde_lock = lock;
	set_rwlde_caller_packed(null_entry, caller);
	if (typeFrom == LCK_RW_TYPE_SHARED) {
		null_entry->rwlde_mode_count = -1;
	} else {
		null_entry->rwlde_mode_count = 1;
	}
	rw_locks_held->rwld_locks_saved++;
}

__abortlike
static void
add_held_rwlock_too_many_panic(thread_t thread)
{
	panic("RW lock too many rw locks held, rwld_locks_acquired maxed out for thread %p", thread);
}

static inline void
add_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, void* caller)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
	struct rw_lock_debug_entry *null_entry;

	if (__probable(rw_lock_debug_disabled())) {
		return;
	}

	if (__improbable(rw_locks_held->rwld_locks_acquired == UINT32_MAX)) {
		add_held_rwlock_too_many_panic(thread);
	}
	rw_locks_held->rwld_locks_acquired++;

	if (type == LCK_RW_TYPE_EXCLUSIVE) {
		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
			//array is full
			rw_locks_held->rwld_overflow = 1;
			return;
		}
		null_entry = find_empty_slot(rw_locks_held);
		null_entry->rwlde_lock = lock;
		set_rwlde_caller_packed(null_entry, caller);
		null_entry->rwlde_mode_count = -1;
		rw_locks_held->rwld_locks_saved++;
		return;
	} else {
		if (__probable(rw_locks_held->rwld_locks_saved == 0)) {
			//array is empty
			goto add_shared;
		}

		boolean_t allow_shared_recursive;
		if (lck_rw_recursive_shared_assert_74048094) {
			allow_shared_recursive = (lock->lck_rw_priv_excl == 0);
		} else {
			allow_shared_recursive = TRUE;
		}
		if (allow_shared_recursive) {
			//It could be already locked in shared mode
			struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
			if (entry != NULL) {
				assert(entry->rwlde_mode_count > 0);
				assertf(entry->rwlde_mode_count != INT8_MAX,
				    "RW lock %p with too many recursive shared held "
				    "from %p caller %p read %d state 0x%x owner 0x%p",
				    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
				    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
				entry->rwlde_mode_count += 1;
				return;
			}
		}

		//none of the locks were a match
		//try to add a new entry
		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
			//array is full
			rw_locks_held->rwld_overflow = 1;
			return;
		}

add_shared:
		null_entry = find_empty_slot(rw_locks_held);
		null_entry->rwlde_lock = lock;
		set_rwlde_caller_packed(null_entry, caller);
		null_entry->rwlde_mode_count = 1;
		rw_locks_held->rwld_locks_saved++;
	}
}

static inline void
remove_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;

	if (__probable(rw_lock_debug_disabled())) {
		return;
	}

	if (__improbable(rw_locks_held->rwld_locks_acquired == 0)) {
		return;
	}
	rw_locks_held->rwld_locks_acquired--;

	if (rw_locks_held->rwld_locks_saved == 0) {
		assert(rw_locks_held->rwld_overflow == 1);
		goto out;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__probable(entry != NULL)) {
		if (type == LCK_RW_TYPE_EXCLUSIVE) {
			assert(entry->rwlde_mode_count == -1);
			entry->rwlde_mode_count = 0;
		} else {
			assert(entry->rwlde_mode_count > 0);
			entry->rwlde_mode_count--;
			if (entry->rwlde_mode_count > 0) {
				goto out;
			}
		}
		entry->rwlde_caller_packed = 0;
		entry->rwlde_lock = NULL;
		rw_locks_held->rwld_locks_saved--;
	} else {
		assert(rw_locks_held->rwld_overflow == 1);
	}

out:
	if (rw_locks_held->rwld_locks_acquired == 0) {
		rw_locks_held->rwld_overflow = 0;
	}
	return;
}
#endif /* DEBUG_RW */

/*
 * We disable interrupts while holding the RW interlock to prevent an
 * interrupt from exacerbating hold time.
 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 */
static inline boolean_t
lck_interlock_lock(
	lck_rw_t        *lck)
{
	boolean_t       istate;

	istate = ml_set_interrupts_enabled(FALSE);
	lck_rw_ilk_lock(lck);
	return istate;
}

static inline void
lck_interlock_unlock(
	lck_rw_t        *lck,
	boolean_t       istate)
{
	lck_rw_ilk_unlock(lck);
	ml_set_interrupts_enabled(istate);
}

/*
 * compute the deadline to spin against when
 * waiting for a change of state on a lck_rw_t
 */
static inline uint64_t
lck_rw_deadline_for_spin(
	lck_rw_t        *lck)
{
	lck_rw_word_t   word;

	word.data = ordered_load_rw(lck);
	if (word.can_sleep) {
		if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
			/*
			 * there are already threads waiting on this lock... this
			 * implies that they have spun beyond their deadlines waiting for
			 * the desired state to show up so we will not bother spinning at this time...
			 *   or
			 * the current number of threads sharing this lock exceeds our capacity to run them
			 * concurrently and since all states we're going to spin for require the rw_shared_count
			 * to be at 0, we'll not bother spinning since the latency for this to happen is
			 * unpredictable...
			 */
			return mach_absolute_time();
		}
		return mach_absolute_time() + os_atomic_load(&MutexSpin, relaxed);
	} else {
		return mach_absolute_time() + (100000LL * 1000000000LL);
	}
}

/*
 * This inline is used when busy-waiting for an rw lock.
 * If interrupts were disabled when the lock primitive was called,
 * we poll the IPI handler for pending TLB flushes on x86.
 */
static inline void
lck_rw_lock_pause(
	boolean_t       interrupts_enabled)
{
#if X86_64
	if (!interrupts_enabled) {
		handle_pending_TLB_flushes();
	}
	cpu_pause();
#else
	(void) interrupts_enabled;
	wait_for_event();
#endif
}

typedef enum __enum_closed {
	LCK_RW_DRAIN_S_DRAINED       = 0,
	LCK_RW_DRAIN_S_NOT_DRAINED   = 1,
	LCK_RW_DRAIN_S_EARLY_RETURN  = 2,
	LCK_RW_DRAIN_S_TIMED_OUT     = 3,
} lck_rw_drain_state_t;

static lck_rw_drain_state_t
lck_rw_drain_status(
	lck_rw_t        *lock,
	uint32_t        status_mask,
	boolean_t       wait,
	bool            (^lock_pause)(void))
{
	uint64_t        deadline = 0;
	uint32_t        data;
	boolean_t       istate = FALSE;

	if (wait) {
		deadline = lck_rw_deadline_for_spin(lock);
#if __x86_64__
		istate = ml_get_interrupts_enabled();
#endif
	}

	for (;;) {
#if __x86_64__
		data = os_atomic_load(&lock->lck_rw_data, relaxed);
#else
		data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
#endif
		if ((data & status_mask) == 0) {
			atomic_exchange_abort();
			return LCK_RW_DRAIN_S_DRAINED;
		}

		if (!wait) {
			atomic_exchange_abort();
			return LCK_RW_DRAIN_S_NOT_DRAINED;
		}

		lck_rw_lock_pause(istate);

		if (mach_absolute_time() >= deadline) {
			return LCK_RW_DRAIN_S_TIMED_OUT;
		}

		if (lock_pause && lock_pause()) {
			return LCK_RW_DRAIN_S_EARLY_RETURN;
		}
	}
}

/*
 * Spin while interlock is held.
 */
static inline void
lck_rw_interlock_spin(
	lck_rw_t        *lock)
{
	uint32_t        data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_relaxed);
		if (data & LCK_RW_INTERLOCK) {
#if __x86_64__
			cpu_pause();
#else
			wait_for_event();
#endif
		} else {
			atomic_exchange_abort();
			return;
		}
	}
}

#define LCK_RW_GRAB_WANT        0
#define LCK_RW_GRAB_SHARED      1

typedef enum __enum_closed __enum_options {
	LCK_RW_GRAB_F_SHARED    = 0x0,  // Not really a flag obviously but makes call sites more readable.
	LCK_RW_GRAB_F_WANT_EXCL = 0x1,
	LCK_RW_GRAB_F_WAIT      = 0x2,
} lck_rw_grab_flags_t;

typedef enum __enum_closed {
	LCK_RW_GRAB_S_NOT_LOCKED    = 0,
	LCK_RW_GRAB_S_LOCKED        = 1,
	LCK_RW_GRAB_S_EARLY_RETURN  = 2,
	LCK_RW_GRAB_S_TIMED_OUT     = 3,
} lck_rw_grab_state_t;

static lck_rw_grab_state_t
lck_rw_grab(
	lck_rw_t            *lock,
	lck_rw_grab_flags_t flags,
	bool                (^lock_pause)(void))
{
	uint64_t        deadline = 0;
	uint32_t        data, prev;
	boolean_t       do_exch, istate = FALSE;

	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);

	if ((flags & LCK_RW_GRAB_F_WAIT) != 0) {
		deadline = lck_rw_deadline_for_spin(lock);
#if __x86_64__
		istate = ml_get_interrupts_enabled();
#endif
	}

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
		if (data & LCK_RW_INTERLOCK) {
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		do_exch = FALSE;
		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
			if ((data & LCK_RW_WANT_EXCL) == 0) {
				data |= LCK_RW_WANT_EXCL;
				do_exch = TRUE;
			}
		} else {        // LCK_RW_GRAB_SHARED
			if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
			    (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
				data += LCK_RW_SHARED_READER;
				do_exch = TRUE;
			}
		}
		if (do_exch) {
			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
				return LCK_RW_GRAB_S_LOCKED;
			}
		} else {
			if ((flags & LCK_RW_GRAB_F_WAIT) == 0) {
				atomic_exchange_abort();
				return LCK_RW_GRAB_S_NOT_LOCKED;
			}

			lck_rw_lock_pause(istate);

			if (mach_absolute_time() >= deadline) {
				return LCK_RW_GRAB_S_TIMED_OUT;
			}
			if (lock_pause && lock_pause()) {
				return LCK_RW_GRAB_S_EARLY_RETURN;
			}
		}
	}
}

/*
 * The inverse of lck_rw_grab - drops either the LCK_RW_WANT_EXCL bit or
 * decrements the reader count. Doesn't deal with waking up waiters - i.e.
 * should only be called when can_sleep is false.
 */
static void
lck_rw_drop(lck_rw_t *lock, lck_rw_grab_flags_t flags)
{
	uint32_t data, prev;

	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);
	assert(!lock->lck_rw_can_sleep);

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);

		/* Interlock should never be taken when can_sleep is false. */
		assert3u(data & LCK_RW_INTERLOCK, ==, 0);

		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
			data &= ~LCK_RW_WANT_EXCL;
		} else {
			data -= LCK_RW_SHARED_READER;
		}

		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
			break;
		}

		cpu_pause();
	}

	return;
}

static boolean_t
lck_rw_lock_exclusive_gen(
	lck_rw_t        *lock,
	bool            (^lock_pause)(void))
{
	__assert_only thread_t self = current_thread();
	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
	lck_rw_word_t           word;
	int                     slept = 0;
	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
	lck_rw_drain_state_t    drain_state = LCK_RW_DRAIN_S_NOT_DRAINED;
	wait_result_t           res = 0;
	boolean_t               istate;

#if     CONFIG_DTRACE
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
#endif

	assertf(lock->lck_rw_owner != self->ctid,
	    "Lock already held state=0x%x, owner=%p",
	    ordered_load_rw(lock), self);

#ifdef DEBUG_RW
	/*
	 * Best effort attempt to check that this thread
	 * is not already holding the lock (this checks read mode too).
	 */
	assert_canlock_rwlock(lock, self, LCK_RW_TYPE_EXCLUSIVE);
#endif /* DEBUG_RW */

	/*
	 *	Try to acquire the lck_rw_want_excl bit.
	 */
	while (lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL, NULL) != LCK_RW_GRAB_S_LOCKED) {
#if     CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lock->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, 0, 0, 0, 0);

		grab_state = lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT, lock_pause);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, 0, 0, grab_state, 0);

		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		word.data = ordered_load_rw(lock);
		if (word.can_sleep) {
			istate = lck_interlock_lock(lock);
			word.data = ordered_load_rw(lock);

			if (word.want_excl) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				word.w_waiting = 1;
				ordered_store_rw(lock, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lock, istate);
				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				word.want_excl = 1;
				ordered_store_rw(lock, word.data);
				lck_interlock_unlock(lock, istate);
				break;
			}
		}
	}

	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
		assert(lock_pause);
		return FALSE;
	}

	/*
	 * Wait for readers (and upgrades) to finish...
	 */
	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
#if     CONFIG_DTRACE
		/*
		 * Either sleeping or spinning is happening, start
		 * a timing of our delay interval now.  If we set it
		 * to -1 we don't have accurate data so we cannot later
		 * decide to record a dtrace spin or sleep event.
		 */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lock->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE, lock_pause);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, drain_state, 0);

		if (drain_state == LCK_RW_DRAIN_S_DRAINED ||
		    drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		word.data = ordered_load_rw(lock);
		if (word.can_sleep) {
			istate = lck_interlock_lock(lock);
			word.data = ordered_load_rw(lock);

			if (word.shared_count != 0 || word.want_upgrade) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				word.w_waiting = 1;
				ordered_store_rw(lock, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lock, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				lck_interlock_unlock(lock, istate);
				/*
				 * must own the lock now, since we checked for
				 * readers or upgrade owner behind the interlock
				 * no need for a call to 'lck_rw_drain_status'
				 */
				break;
			}
		}
	}

#if     CONFIG_DTRACE
	/*
	 * Decide what latencies we suffered that are Dtrace events.
	 * If we have set wait_interval, then we either spun or slept.
	 * At least we get out from under the interlock before we record
	 * which is the best we can do here to minimize the impact
	 * of the tracing.
	 * If we have set wait_interval to -1, then dtrace was not enabled when we
	 * started sleeping/spinning so we don't record this event.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
			    mach_absolute_time() - wait_interval, 1);
		} else {
			/*
			 * For the blocking case, we also record if when we blocked
			 * it was held for read or write, and how many readers.
			 * Notice that above we recorded this before we dropped
			 * the interlock so the count is accurate.
			 */
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
#endif /* CONFIG_DTRACE */

	if (drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
		lck_rw_drop(lock, LCK_RW_GRAB_F_WANT_EXCL);
		assert(lock_pause);
		return FALSE;
	}

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
#endif  /* CONFIG_DTRACE */

	return TRUE;
}

#define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \
	    (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
	    LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
/*!
 * @function lck_rw_lock_exclusive_check_contended
 *
 * @abstract
 * Locks a rw_lock in exclusive mode.
 *
 * @discussion
 * This routine IS EXPERIMENTAL.
 * It's only used for the vm object lock, and use for other subsystems is UNSUPPORTED.
 * Note that the return value is ONLY A HEURISTIC w.r.t. the lock's contention.
 *
 * @param lock           rw_lock to lock.
 *
 * @returns Returns TRUE if the thread spun or blocked while attempting to acquire the lock, FALSE
 *          otherwise.
 */
bool
lck_rw_lock_exclusive_check_contended(
	lck_rw_t        *lock)
{
	thread_t        thread = current_thread();
	bool            contended  = false;

	if (lock->lck_rw_can_sleep) {
		lck_rw_lock_count_inc(thread, lock);
	} else if (get_preemption_level() == 0) {
		panic("Taking non-sleepable RW lock with preemption enabled");
	}

	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
#if     CONFIG_DTRACE
		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
#endif  /* CONFIG_DTRACE */
	} else {
		contended = true;
		(void) lck_rw_lock_exclusive_gen(lock, NULL);
	}
	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
	ordered_store_rw_owner(lock, thread->ctid);

#ifdef DEBUG_RW
	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, __builtin_return_address(0));
#endif /* DEBUG_RW */
	return contended;
}
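
/*
 * Usage sketch (illustrative only): how a caller might consume the
 * contention heuristic. The `object` structure and the policy around the
 * hint are hypothetical.
 *
 *	if (lck_rw_lock_exclusive_check_contended(&object->lock)) {
 *		// we spun or blocked on the way in; a caller could record
 *		// this hint and adapt its locking strategy next time
 *	}
 *	// ... the lock is held exclusively either way ...
 *	lck_rw_unlock_exclusive(&object->lock);
 */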

__attribute__((always_inline))
static boolean_t
lck_rw_lock_exclusive_internal_inline(
	lck_rw_t        *lock,
	void            *caller,
	bool            (^lock_pause)(void))
{
#pragma unused(caller)
	thread_t        thread = current_thread();

	if (lock->lck_rw_can_sleep) {
		lck_rw_lock_count_inc(thread, lock);
	} else if (get_preemption_level() == 0) {
		panic("Taking non-sleepable RW lock with preemption enabled");
	}

	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
#if     CONFIG_DTRACE
		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
#endif  /* CONFIG_DTRACE */
	} else if (!lck_rw_lock_exclusive_gen(lock, lock_pause)) {
		/*
		 * lck_rw_lock_exclusive_gen() should only return
		 * early if lock_pause has been passed and
		 * returns FALSE. lock_pause is exclusive with
		 * lck_rw_can_sleep().
		 */
		assert(!lock->lck_rw_can_sleep);
		return FALSE;
	}

	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
	ordered_store_rw_owner(lock, thread->ctid);

#if DEBUG_RW
	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
#endif /* DEBUG_RW */

	return TRUE;
}

__attribute__((noinline))
static void
lck_rw_lock_exclusive_internal(
	lck_rw_t        *lock,
	void            *caller)
{
	(void) lck_rw_lock_exclusive_internal_inline(lock, caller, NULL);
}

/*!
 * @function lck_rw_lock_exclusive
 *
 * @abstract
 * Locks a rw_lock in exclusive mode.
 *
 * @discussion
 * This function can block.
 * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
 * can acquire it in exclusive mode.
 * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
 *
 * @param lock           rw_lock to lock.
 */
void
lck_rw_lock_exclusive(
	lck_rw_t        *lock)
{
	(void) lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), NULL);
}
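
/*
 * Usage sketch (illustrative only): a writer-side update; `rwl` is assumed
 * to have been initialized with lck_rw_init().
 *
 *	lck_rw_lock_exclusive(rwl);
 *	// ... mutate the protected state: no reader or other writer can
 *	//     hold the lock concurrently ...
 *	lck_rw_unlock_exclusive(rwl);
 */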

/*!
 * @function lck_rw_lock_exclusive_b
 *
 * @abstract
 * Locks a rw_lock in exclusive mode. Returns early if the lock can't be acquired
 * and the specified block returns true.
 *
 * @discussion
 * Identical to lck_rw_lock_exclusive() but can return early if the lock can't be
 * acquired and the specified block returns true. The block is called
 * repeatedly when waiting to acquire the lock.
 * Should only be called when the lock cannot sleep (i.e. when
 * lock->lck_rw_can_sleep is false).
 *
 * @param lock           rw_lock to lock.
 * @param lock_pause     block invoked while waiting to acquire lock
 *
 * @returns              Returns TRUE if the lock is successfully taken,
 *                       FALSE if the block returns true and the lock has
 *                       not been acquired.
 */
boolean_t
lck_rw_lock_exclusive_b(
	lck_rw_t        *lock,
	bool            (^lock_pause)(void))
{
	assert(!lock->lck_rw_can_sleep);

	return lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), lock_pause);
}
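
/*
 * Usage sketch (illustrative only): bounded acquisition of a non-sleepable
 * lock, bailing out when a hypothetical `should_abort()` predicate fires.
 *
 *	if (lck_rw_lock_exclusive_b(rwl, ^bool (void) {
 *		return should_abort();	// polled while spinning
 *	})) {
 *		// ... lock held exclusively ...
 *		lck_rw_unlock_exclusive(rwl);
 *	} else {
 *		// the block returned true first; the lock is NOT held
 *	}
 */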

/*
 *	Routine:	lck_rw_lock_shared_gen
 *	Function:
 *		Fast path code has determined that this lock
 *		is held exclusively... this is where we spin/block
 *		until we can acquire the lock in the shared mode
 */
static boolean_t
lck_rw_lock_shared_gen(
	lck_rw_t        *lck,
	bool            (^lock_pause)(void))
{
	__assert_only thread_t  self = current_thread();
	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
	lck_rw_word_t           word;
	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
	int                     slept = 0;
	wait_result_t           res = 0;
	boolean_t               istate;

#if     CONFIG_DTRACE
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
#endif /* CONFIG_DTRACE */

	assertf(lck->lck_rw_owner != self->ctid,
	    "Lock already held state=0x%x, owner=%p",
	    ordered_load_rw(lck), self);

#ifdef DEBUG_RW
	/*
	 * Best effort attempt to check that this thread
	 * is not already holding the lock in shared mode.
	 */
	assert_canlock_rwlock(lck, self, LCK_RW_TYPE_SHARED);
#endif

	while (lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED, NULL) != LCK_RW_GRAB_S_LOCKED) {
#if     CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);

		grab_state = lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED | LCK_RW_GRAB_F_WAIT, lock_pause);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, grab_state, 0);

		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
			break;
		}

		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock for read
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			word.data = ordered_load_rw(lck);
			if ((word.want_excl || word.want_upgrade) &&
			    ((word.shared_count == 0) || word.priv_excl)) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
				    trace_lck, word.want_excl, word.want_upgrade, 0, 0);

				word.r_waiting = 1;
				ordered_store_rw(lck, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
				res = assert_wait(LCK_RW_READER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
				    trace_lck, res, slept, 0, 0);
			} else {
				word.shared_count++;
				ordered_store_rw(lck, word.data);
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}

#if     CONFIG_DTRACE
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 0,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
#endif /* CONFIG_DTRACE */

	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
		assert(lock_pause);
		return FALSE;
	}

#if     CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
#endif  /* CONFIG_DTRACE */

	return TRUE;
}

__attribute__((always_inline))
static boolean_t
lck_rw_lock_shared_internal_inline(
	lck_rw_t        *lock,
	void            *caller,
	bool            (^lock_pause)(void))
{
#pragma unused(caller)

	uint32_t        data, prev;
	thread_t        thread = current_thread();
#ifdef DEBUG_RW
	boolean_t       check_canlock = TRUE;
#endif

	if (lock->lck_rw_can_sleep) {
		lck_rw_lock_count_inc(thread, lock);
	} else if (get_preemption_level() == 0) {
		panic("Taking non-sleepable RW lock with preemption enabled");
	}

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
			atomic_exchange_abort();
			if (!lck_rw_lock_shared_gen(lock, lock_pause)) {
				/*
				 * lck_rw_lock_shared_gen() should only return
				 * early if lock_pause has been passed and
				 * returns FALSE. lock_pause is exclusive with
				 * lck_rw_can_sleep().
				 */
				assert(!lock->lck_rw_can_sleep);
				return FALSE;
			}

			goto locked;
		}
#ifdef DEBUG_RW
		if ((data & LCK_RW_SHARED_MASK) == 0) {
			/*
			 * If the lock is uncontended,
			 * we do not need to check if we can lock it
			 */
			check_canlock = FALSE;
		}
#endif
		data += LCK_RW_SHARED_READER;
		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
			break;
		}
		cpu_pause();
	}
#ifdef DEBUG_RW
	if (check_canlock) {
		/*
		 * Best effort attempt to check that this thread
		 * is not already holding the lock (this checks read mode too).
		 */
		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
	}
#endif
locked:
	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));

#if     CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
#endif  /* CONFIG_DTRACE */

#ifdef DEBUG_RW
	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
#endif /* DEBUG_RW */

	return TRUE;
}

__attribute__((noinline))
static void
lck_rw_lock_shared_internal(
	lck_rw_t        *lock,
	void            *caller)
{
	(void) lck_rw_lock_shared_internal_inline(lock, caller, NULL);
}

/*!
 * @function lck_rw_lock_shared
 *
 * @abstract
 * Locks a rw_lock in shared mode.
 *
 * @discussion
 * This function can block.
 * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
 * can acquire it in exclusive mode.
 * If the lock is held in shared mode and there are no writers waiting, a reader will be able to acquire
 * the lock without waiting.
 * If the lock is held in shared mode and there is at least one writer waiting, a reader will wait
 * for all the writers to make progress if the lock was initialized with the default settings. If instead
 * RW_SHARED_PRIORITY was selected at initialization time, a reader will never wait if the lock is held
 * in shared mode.
 * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
 *
 * @param lock           rw_lock to lock.
 */
void
lck_rw_lock_shared(
	lck_rw_t        *lock)
{
	(void) lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), NULL);
}
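
/*
 * Usage sketch (illustrative only): a reader-side lookup; any number of
 * threads may hold `rwl` in shared mode concurrently.
 *
 *	lck_rw_lock_shared(rwl);
 *	// ... read the protected state: writers are excluded ...
 *	lck_rw_unlock_shared(rwl);
 */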

/*!
 * @function lck_rw_lock_shared_b
 *
 * @abstract
 * Locks a rw_lock in shared mode. Returns early if the lock can't be acquired
 * and the specified block returns true.
 *
 * @discussion
 * Identical to lck_rw_lock_shared() but can return early if the lock can't be
 * acquired and the specified block returns true. The block is called
 * repeatedly when waiting to acquire the lock.
 * Should only be called when the lock cannot sleep (i.e. when
 * lock->lck_rw_can_sleep is false).
 *
 * @param lock           rw_lock to lock.
 * @param lock_pause     block invoked while waiting to acquire lock
 *
 * @returns              Returns TRUE if the lock is successfully taken,
 *                       FALSE if the block returns true and the lock has
 *                       not been acquired.
 */
boolean_t
lck_rw_lock_shared_b(
	lck_rw_t        *lock,
	bool            (^lock_pause)(void))
{
	assert(!lock->lck_rw_can_sleep);

	return lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), lock_pause);
}

/*
 *	Routine:	lck_rw_lock_shared_to_exclusive_failure
 *	Function:
 *		Fast path code has already dropped our read
 *		count and determined that someone else owns 'lck_rw_want_upgrade';
 *		if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting'.
 *		All we need to do here is determine if a wakeup is needed.
 */
static boolean_t
lck_rw_lock_shared_to_exclusive_failure(
	lck_rw_t        *lck,
	uint32_t        prior_lock_state)
{
	thread_t        thread = current_thread();

	if ((prior_lock_state & LCK_RW_W_WAITING) &&
	    ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
		/*
		 *	Someone else has requested upgrade.
		 *	Since we've released the read lock, wake
		 *	him up if he's blocked waiting
		 */
		thread_wakeup(LCK_RW_WRITER_EVENT(lck));
	}

	/* Check if dropping the lock means that we need to unpromote */
	if (lck->lck_rw_can_sleep) {
		lck_rw_lock_count_dec(thread, lck);
	}

	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
	    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);

#ifdef DEBUG_RW
	remove_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
#endif /* DEBUG_RW */

	return FALSE;
}
1542 
1543 /*
1544  *	Routine:	lck_rw_lock_shared_to_exclusive_success
1545  *	Function:
1546  *		the fast path code has already dropped our read
1547  *		count and successfully acquired 'lck_rw_want_upgrade'
1548  *		we just need to wait for the rest of the readers to drain
1549  *		and then we can return as the exclusive holder of this lock
1550  */
1551 static void
lck_rw_lock_shared_to_exclusive_success(lck_rw_t * lock)1552 lck_rw_lock_shared_to_exclusive_success(
1553 	lck_rw_t        *lock)
1554 {
1555 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1556 	int                     slept = 0;
1557 	lck_rw_word_t           word;
1558 	wait_result_t           res;
1559 	boolean_t               istate;
1560 	lck_rw_drain_state_t    drain_state;
1561 
1562 #if     CONFIG_DTRACE
1563 	uint64_t                wait_interval = 0;
1564 	int                     readers_at_sleep = 0;
1565 	boolean_t               dtrace_ls_initialized = FALSE;
1566 	boolean_t               dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1567 #endif
1568 
1569 	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
1570 		word.data = ordered_load_rw(lock);
1571 #if     CONFIG_DTRACE
1572 		if (dtrace_ls_initialized == FALSE) {
1573 			dtrace_ls_initialized = TRUE;
1574 			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1575 			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1576 			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1577 			if (dtrace_ls_enabled) {
1578 				/*
1579 				 * Either sleeping or spinning is happening,
1580 				 * so start timing our delay interval now.
1581 				 */
1582 				readers_at_sleep = word.shared_count;
1583 				wait_interval = mach_absolute_time();
1584 			}
1585 		}
1586 #endif
1587 
1588 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1589 		    trace_lck, word.shared_count, 0, 0, 0);
1590 
1591 		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE, NULL);
1592 
1593 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1594 		    trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
1595 
1596 		if (drain_state == LCK_RW_DRAIN_S_DRAINED) {
1597 			break;
1598 		}
1599 
1600 		/*
1601 		 * if we get here, the spin deadline in lck_rw_wait_on_status()
1602 		 * has expired w/o the rw_shared_count having drained to 0
1603 		 * check to see if we're allowed to do a thread_block
1604 		 */
1605 		if (word.can_sleep) {
1606 			istate = lck_interlock_lock(lock);
1607 
1608 			word.data = ordered_load_rw(lock);
1609 			if (word.shared_count != 0) {
1610 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1611 				    trace_lck, word.shared_count, 0, 0, 0);
1612 
1613 				word.w_waiting = 1;
1614 				ordered_store_rw(lock, word.data);
1615 
1616 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1617 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1618 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1619 				lck_interlock_unlock(lock, istate);
1620 
1621 				if (res == THREAD_WAITING) {
1622 					res = thread_block(THREAD_CONTINUE_NULL);
1623 					slept++;
1624 				}
1625 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1626 				    trace_lck, res, slept, 0, 0);
1627 			} else {
1628 				lck_interlock_unlock(lock, istate);
1629 				break;
1630 			}
1631 		}
1632 	}
1633 #if     CONFIG_DTRACE
1634 	/*
1635 	 * 'slept' tells us whether we took the blocking path above or only spun.
1636 	 */
1637 	if (dtrace_ls_enabled == TRUE) {
1638 		if (slept == 0) {
1639 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
1640 		} else {
1641 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
1642 			    mach_absolute_time() - wait_interval, 1,
1643 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1644 		}
1645 	}
1646 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1647 #endif
1648 }
1649 
1650 /*!
1651  * @function lck_rw_lock_shared_to_exclusive
1652  *
1653  * @abstract
1654  * Upgrades a rw_lock held in shared mode to exclusive.
1655  *
1656  * @discussion
1657  * This function can block.
1658  * Only one reader at a time can upgrade to exclusive mode. If the upgrades fails the function will
1659  * return with the lock not held.
1660  * The caller needs to hold the lock in shared mode to upgrade it.
1661  *
1662  * @param lock           rw_lock already held in shared mode to upgrade.
1663  *
1664  * @returns TRUE if the lock was upgraded, FALSE if it was not possible.
1665  *          If the function was not able to upgrade the lock, the lock will be dropped
1666  *          by the function.
1667  */
1668 boolean_t
1669 lck_rw_lock_shared_to_exclusive(
1670 	lck_rw_t        *lock)
1671 {
1672 	thread_t thread = current_thread();
1673 	uint32_t data, prev;
1674 
1675 	assertf(lock->lck_rw_priv_excl != 0, "lock %p thread %p", lock, current_thread());
1676 
1677 #if DEBUG_RW
1678 	assert_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1679 #endif /* DEBUG_RW */
1680 
1681 	for (;;) {
1682 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1683 		if (data & LCK_RW_INTERLOCK) {
1684 			atomic_exchange_abort();
1685 			lck_rw_interlock_spin(lock);
1686 			continue;
1687 		}
1688 		if (data & LCK_RW_WANT_UPGRADE) {
1689 			data -= LCK_RW_SHARED_READER;
1690 			if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
1691 				data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
1692 			}
1693 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1694 				return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1695 			}
1696 		} else {
1697 			data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
1698 			data -= LCK_RW_SHARED_READER;           /* and shed our read count */
1699 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1700 				break;
1701 			}
1702 		}
1703 		cpu_pause();
1704 	}
1705 	/* we now own the WANT_UPGRADE */
1706 	if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
1707 		lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
1708 	}
1709 
1710 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1711 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1712 
1713 	ordered_store_rw_owner(lock, thread->ctid);
1714 #if     CONFIG_DTRACE
1715 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1716 #endif  /* CONFIG_DTRACE */
1717 
1718 #if DEBUG_RW
1719 	change_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, __builtin_return_address(0));
1720 #endif /* DEBUG_RW */
1721 	return TRUE;
1722 }
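
/*
 * [Editor's example, not in the original revision] The canonical upgrade
 * pattern around lck_rw_lock_shared_to_exclusive(). A failed upgrade
 * drops the lock entirely, so the caller must re-acquire and re-validate
 * anything it observed under the shared hold.
 */
#if 0 /* illustrative sketch only */
static void
example_upgrade_pattern(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);
	/* ... inspect state under the shared hold ... */
	if (!lck_rw_lock_shared_to_exclusive(lock)) {
		/* upgrade lost the race: the lock was dropped, take it fresh */
		lck_rw_lock_exclusive(lock);
		/* ... re-validate: another writer may have run in between ... */
	}
	/* ... mutate state under the exclusive hold ... */
	lck_rw_unlock_exclusive(lock);
}
#endif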
1723 
1724 /*
1725  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1726  *      Function:
1727  *		Fast path has already dropped
1728  *		our exclusive state and bumped lck_rw_shared_count;
1729  *		all we need to do here is determine if anyone
1730  *		needs to be awakened.
1731  */
1732 static void
1733 lck_rw_lock_exclusive_to_shared_gen(
1734 	lck_rw_t        *lck,
1735 	uint32_t        prior_lock_state,
1736 	void            *caller)
1737 {
1738 #pragma unused(caller)
1739 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1740 	lck_rw_word_t   fake_lck;
1741 
1742 	/*
1743 	 * prior_lock_state is a snapshot of the 1st word of the
1744 	 * lock in question... we copy it into a local lck_rw_word_t
1745 	 * and carefully don't access anything beyond what's defined
1746 	 * in the first word of a lck_rw_t
1747 	 */
1748 	fake_lck.data = prior_lock_state;
1749 
1750 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1751 	    trace_lck, fake_lck.want_excl, fake_lck.want_upgrade, 0, 0);
1752 
1753 	/*
1754 	 * don't wake up anyone waiting to take the lock exclusively
1755 	 * since we hold a read count... when the read count drops to 0,
1756 	 * the writers will be woken.
1757 	 *
1758 	 * wake up any waiting readers if we don't have any writers waiting,
1759 	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1760 	 */
1761 	if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1762 		thread_wakeup(LCK_RW_READER_EVENT(lck));
1763 	}
1764 
1765 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1766 	    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1767 
1768 #if CONFIG_DTRACE
1769 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1770 #endif
1771 
1772 #if DEBUG_RW
1773 	thread_t        thread = current_thread();
1774 	change_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1775 #endif /* DEBUG_RW */
1776 }
1777 
1778 /*!
1779  * @function lck_rw_lock_exclusive_to_shared
1780  *
1781  * @abstract
1782  * Downgrades a rw_lock held in exclusive mode to shared.
1783  *
1784  * @discussion
1785  * The caller needs to hold the lock in exclusive mode to be able to downgrade it.
1786  *
1787  * @param lock           rw_lock already held in exclusive mode to downgrade.
1788  */
1789 void
1790 lck_rw_lock_exclusive_to_shared(
1791 	lck_rw_t        *lock)
1792 {
1793 	uint32_t        data, prev;
1794 
1795 	assertf(lock->lck_rw_owner == current_thread()->ctid,
1796 	    "state=0x%x, owner=%p", lock->lck_rw_data,
1797 	    ctid_get_thread_unsafe(lock->lck_rw_owner));
1798 	ordered_store_rw_owner(lock, 0);
1799 
1800 	for (;;) {
1801 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1802 		if (data & LCK_RW_INTERLOCK) {
1803 			atomic_exchange_abort();
1804 			lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
1805 			continue;
1806 		}
1807 		data += LCK_RW_SHARED_READER;
1808 		if (data & LCK_RW_WANT_UPGRADE) {
1809 			data &= ~(LCK_RW_WANT_UPGRADE);
1810 		} else {
1811 			data &= ~(LCK_RW_WANT_EXCL);
1812 		}
1813 		if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1814 			data &= ~(LCK_RW_W_WAITING);
1815 		}
1816 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1817 			break;
1818 		}
1819 		cpu_pause();
1820 	}
1821 	lck_rw_lock_exclusive_to_shared_gen(lock, prev, __builtin_return_address(0));
1822 }
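
/*
 * [Editor's example, not in the original revision] Downgrading once the
 * mutation is published lets new readers in while this thread keeps a
 * stable read-only view until it unlocks.
 */
#if 0 /* illustrative sketch only */
static void
example_downgrade_pattern(lck_rw_t *lock)
{
	lck_rw_lock_exclusive(lock);
	/* ... publish an update ... */
	lck_rw_lock_exclusive_to_shared(lock);
	/* ... other readers may enter; our view stays consistent ... */
	lck_rw_unlock_shared(lock);
}
#endif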
1823 
1824 /*
1825  * Very sad hack, but the codegen for lck_rw_lock
1826  * is very unhappy with the combination of __builtin_return_address()
1827  * and a noreturn function. For some reason it adds more frames
1828  * than it should. rdar://76570684
1829  */
1830 void
1831 _lck_rw_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
1832 #pragma clang diagnostic push
1833 #pragma clang diagnostic ignored "-Wmissing-noreturn"
1834 __attribute__((noinline, weak))
1835 void
1836 _lck_rw_lock_type_panic(
1837 	lck_rw_t        *lck,
1838 	lck_rw_type_t   lck_rw_type)
1839 {
1840 	panic("lck_rw_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
1841 }
1842 #pragma clang diagnostic pop
1843 
1844 /*!
1845  * @function lck_rw_lock
1846  *
1847  * @abstract
1848  * Locks a rw_lock with the specified type.
1849  *
1850  * @discussion
1851  * See lck_rw_lock_shared() or lck_rw_lock_exclusive() for more details.
1852  *
1853  * @param lck           rw_lock to lock.
1854  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1855  */
1856 void
1857 lck_rw_lock(
1858 	lck_rw_t        *lck,
1859 	lck_rw_type_t   lck_rw_type)
1860 {
1861 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1862 		return lck_rw_lock_shared_internal(lck, __builtin_return_address(0));
1863 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1864 		return lck_rw_lock_exclusive_internal(lck, __builtin_return_address(0));
1865 	}
1866 	_lck_rw_lock_type_panic(lck, lck_rw_type);
1867 }
1868 
1869 __attribute__((always_inline))
1870 static boolean_t
1871 lck_rw_try_lock_shared_internal_inline(
1872 	lck_rw_t        *lock,
1873 	void            *caller)
1874 {
1875 #pragma unused(caller)
1876 
1877 	uint32_t        data, prev;
1878 	thread_t        thread = current_thread();
1879 #ifdef DEBUG_RW
1880 	boolean_t       check_canlock = TRUE;
1881 #endif
1882 
1883 	for (;;) {
1884 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1885 		if (data & LCK_RW_INTERLOCK) {
1886 			atomic_exchange_abort();
1887 			lck_rw_interlock_spin(lock);
1888 			continue;
1889 		}
1890 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1891 			atomic_exchange_abort();
1892 			return FALSE;             /* lock is busy */
1893 		}
1894 #ifdef DEBUG_RW
1895 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1896 			/*
1897 			 * If the lock is uncontended,
1898 			 * we do not need to check if we can lock it
1899 			 */
1900 			check_canlock = FALSE;
1901 		}
1902 #endif
1903 		data += LCK_RW_SHARED_READER;     /* Increment reader refcount */
1904 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1905 			break;
1906 		}
1907 		cpu_pause();
1908 	}
1909 #ifdef DEBUG_RW
1910 	if (check_canlock) {
1911 		/*
1912 		 * Best effort attempt to check that this thread
1913 		 * is not already holding the lock (this checks read mode too).
1914 		 */
1915 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1916 	}
1917 #endif
1918 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1919 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1920 
1921 	if (lock->lck_rw_can_sleep) {
1922 		lck_rw_lock_count_inc(thread, lock);
1923 	} else if (get_preemption_level() == 0) {
1924 		panic("Taking non-sleepable RW lock with preemption enabled");
1925 	}
1926 
1927 #if     CONFIG_DTRACE
1928 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1929 #endif  /* CONFIG_DTRACE */
1930 
1931 #ifdef DEBUG_RW
1932 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1933 #endif /* DEBUG_RW */
1934 	return TRUE;
1935 }
1936 
1937 __attribute__((noinline))
1938 static boolean_t
1939 lck_rw_try_lock_shared_internal(
1940 	lck_rw_t        *lock,
1941 	void            *caller)
1942 {
1943 	return lck_rw_try_lock_shared_internal_inline(lock, caller);
1944 }
1945 
1946 /*!
1947  * @function lck_rw_try_lock_shared
1948  *
1949  * @abstract
1950  * Tries to lock a rw_lock in read mode.
1951  *
1952  * @discussion
1953  * This function will return and not block in case the lock is already held.
1954  * See lck_rw_lock_shared for more details.
1955  *
1956  * @param lock           rw_lock to lock.
1957  *
1958  * @returns TRUE if the lock is successfully acquired, FALSE if it is held for writing or an upgrade is pending.
1959  */
1960 boolean_t
1961 lck_rw_try_lock_shared(
1962 	lck_rw_t        *lock)
1963 {
1964 	return lck_rw_try_lock_shared_internal_inline(lock, __builtin_return_address(0));
1965 }
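
/*
 * [Editor's example, not in the original revision] An opportunistic
 * read-side fast path: try the non-blocking acquire first and fall back
 * to the blocking one when a writer holds the lock or an upgrade is
 * pending.
 */
#if 0 /* illustrative sketch only */
static void
example_try_shared_pattern(lck_rw_t *lock)
{
	if (!lck_rw_try_lock_shared(lock)) {
		/* writer active or upgrade pending: block for the lock */
		lck_rw_lock_shared(lock);
	}
	/* ... read-side work ... */
	lck_rw_unlock_shared(lock);
}
#endif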
1966 
1967 __attribute__((always_inline))
1968 static boolean_t
1969 lck_rw_try_lock_exclusive_internal_inline(
1970 	lck_rw_t        *lock,
1971 	void            *caller)
1972 {
1973 #pragma unused(caller)
1974 	uint32_t        data, prev;
1975 
1976 	for (;;) {
1977 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1978 		if (data & LCK_RW_INTERLOCK) {
1979 			atomic_exchange_abort();
1980 			lck_rw_interlock_spin(lock);
1981 			continue;
1982 		}
1983 		if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1984 			atomic_exchange_abort();
1985 			return FALSE;
1986 		}
1987 		data |= LCK_RW_WANT_EXCL;
1988 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1989 			break;
1990 		}
1991 		cpu_pause();
1992 	}
1993 	thread_t thread = current_thread();
1994 
1995 	if (lock->lck_rw_can_sleep) {
1996 		lck_rw_lock_count_inc(thread, lock);
1997 	} else if (get_preemption_level() == 0) {
1998 		panic("Taking non-sleepable RW lock with preemption enabled");
1999 	}
2000 
2001 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
2002 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
2003 
2004 	ordered_store_rw_owner(lock, thread->ctid);
2005 #if     CONFIG_DTRACE
2006 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
2007 #endif  /* CONFIG_DTRACE */
2008 
2009 #ifdef DEBUG_RW
2010 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
2011 #endif /* DEBUG_RW */
2012 	return TRUE;
2013 }
2014 
2015 __attribute__((noinline))
2016 static boolean_t
2017 lck_rw_try_lock_exclusive_internal(
2018 	lck_rw_t        *lock,
2019 	void            *caller)
2020 {
2021 	return lck_rw_try_lock_exclusive_internal_inline(lock, caller);
2022 }
2023 
2024 /*!
2025  * @function lck_rw_try_lock_exclusive
2026  *
2027  * @abstract
2028  * Tries to lock a rw_lock in write mode.
2029  *
2030  * @discussion
2031  * This function will return and not block in case the lock is already held.
2032  * See lck_rw_lock_exclusive for more details.
2033  *
2034  * @param lock           rw_lock to lock.
2035  *
2036  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
2037  */
2038 boolean_t
2039 lck_rw_try_lock_exclusive(
2040 	lck_rw_t        *lock)
2041 {
2042 	return lck_rw_try_lock_exclusive_internal_inline(lock, __builtin_return_address(0));
2043 }
2044 
2045 /*
2046  * Very sad hack, but the codegen for lck_rw_try_lock
2047  * is very unhappy with the combination of __builtin_return_address()
2048  * and a noreturn function. For some reason it adds more frames
2049  * than it should. rdar://76570684
2050  */
2051 boolean_t
2052 _lck_rw_try_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
2053 #pragma clang diagnostic push
2054 #pragma clang diagnostic ignored "-Wmissing-noreturn"
2055 __attribute__((noinline, weak))
2056 boolean_t
2057 _lck_rw_try_lock_type_panic(
2058 	lck_rw_t        *lck,
2059 	lck_rw_type_t   lck_rw_type)
2060 {
2061 	panic("lck_rw_try_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
2062 }
2063 #pragma clang diagnostic pop
2064 
2065 /*!
2066  * @function lck_rw_try_lock
2067  *
2068  * @abstract
2069  * Tries to lock a rw_lock with the specified type.
2070  *
2071  * @discussion
2072  * This function will return and not wait/block in case the lock is already held.
2073  * See lck_rw_try_lock_shared() or lck_rw_try_lock_exclusive() for more details.
2074  *
2075  * @param lck           rw_lock to lock.
2076  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2077  *
2078  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
2079  */
2080 boolean_t
2081 lck_rw_try_lock(
2082 	lck_rw_t        *lck,
2083 	lck_rw_type_t   lck_rw_type)
2084 {
2085 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2086 		return lck_rw_try_lock_shared_internal(lck, __builtin_return_address(0));
2087 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2088 		return lck_rw_try_lock_exclusive_internal(lck, __builtin_return_address(0));
2089 	}
2090 	return _lck_rw_try_lock_type_panic(lck, lck_rw_type);
2091 }
2092 
2093 /*
2094  *      Routine:        lck_rw_done_gen
2095  *
2096  *	prior_lock_state is the value in the 1st
2097  *      word of the lock at the time of a successful
2098  *	atomic compare and exchange with the new value...
2099  *      it represents the state of the lock before we
2100  *	decremented the rw_shared_count or cleared either
2101  *      rw_want_upgrade or rw_want_excl and
2102  *	the lck_x_waiting bits...  since the wrapper
2103  *      routine has already changed the state atomically,
2104  *	we just need to decide if we should
2105  *	wake up anyone and what value to return... we do
2106  *	this by examining the state of the lock before
2107  *	we changed it
2108  */
2109 static lck_rw_type_t
2110 lck_rw_done_gen(
2111 	lck_rw_t        *lck,
2112 	uint32_t        prior_lock_state)
2113 {
2114 	lck_rw_word_t   fake_lck;
2115 	lck_rw_type_t   lock_type;
2116 	thread_t        thread;
2117 
2118 	/*
2119 	 * prior_lock_state is a snapshot of the 1st word of the
2120 	 * lock in question... we copy it into a local lck_rw_word_t
2121 	 * and carefully don't access anything beyond what's defined
2122 	 * in the first word of a lck_rw_t
2123 	 */
2124 	fake_lck.data = prior_lock_state;
2125 
2126 	if (fake_lck.shared_count <= 1) {
2127 		if (fake_lck.w_waiting) {
2128 			thread_wakeup(LCK_RW_WRITER_EVENT(lck));
2129 		}
2130 
2131 		if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
2132 			thread_wakeup(LCK_RW_READER_EVENT(lck));
2133 		}
2134 	}
2135 	if (fake_lck.shared_count) {
2136 		lock_type = LCK_RW_TYPE_SHARED;
2137 	} else {
2138 		lock_type = LCK_RW_TYPE_EXCLUSIVE;
2139 	}
2140 
2141 	/* Check if dropping the lock means that we need to unpromote */
2142 	thread = current_thread();
2143 	if (fake_lck.can_sleep) {
2144 		lck_rw_lock_count_dec(thread, lck);
2145 	}
2146 
2147 #if CONFIG_DTRACE
2148 	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
2149 #endif
2150 
2151 #ifdef DEBUG_RW
2152 	remove_held_rwlock(lck, thread, lock_type);
2153 #endif /* DEBUG_RW */
2154 	return lock_type;
2155 }
2156 
2157 /*!
2158  * @function lck_rw_done
2159  *
2160  * @abstract
2161  * Force unlocks a rw_lock without consistency checks.
2162  *
2163  * @discussion
2164  * Do not use unless sure you can avoid consistency checks.
2165  *
2166  * @param lock           rw_lock to unlock.
2167  */
2168 lck_rw_type_t
2169 lck_rw_done(
2170 	lck_rw_t        *lock)
2171 {
2172 	uint32_t        data, prev;
2173 	boolean_t       once = FALSE;
2174 
2175 #ifdef DEBUG_RW
2176 	/*
2177 	 * Best effort attempt to check that this thread
2178 	 * is holding the lock.
2179 	 */
2180 	thread_t thread = current_thread();
2181 	assert_held_rwlock(lock, thread, 0);
2182 #endif /* DEBUG_RW */
2183 	for (;;) {
2184 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
2185 		if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
2186 			atomic_exchange_abort();
2187 			lck_rw_interlock_spin(lock);
2188 			continue;
2189 		}
2190 		if (data & LCK_RW_SHARED_MASK) {        /* lock is held shared */
2191 			assertf(lock->lck_rw_owner == 0,
2192 			    "state=0x%x, owner=%p", lock->lck_rw_data,
2193 			    ctid_get_thread_unsafe(lock->lck_rw_owner));
2194 			data -= LCK_RW_SHARED_READER;
2195 			if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
2196 				goto check_waiters;
2197 			}
2198 		} else {                                        /* if reader count == 0, must be exclusive lock */
2199 			if (data & LCK_RW_WANT_UPGRADE) {
2200 				data &= ~(LCK_RW_WANT_UPGRADE);
2201 			} else {
2202 				if (data & LCK_RW_WANT_EXCL) {
2203 					data &= ~(LCK_RW_WANT_EXCL);
2204 				} else {                                /* lock is not 'owned', panic */
2205 					panic("Releasing non-exclusive RW lock without a reader refcount!");
2206 				}
2207 			}
2208 			if (!once) {
2209 				// Only check for holder and clear it once
2210 				assertf(lock->lck_rw_owner == current_thread()->ctid,
2211 				    "state=0x%x, owner=%p", lock->lck_rw_data,
2212 				    ctid_get_thread_unsafe(lock->lck_rw_owner));
2213 				ordered_store_rw_owner(lock, 0);
2214 				once = TRUE;
2215 			}
2216 check_waiters:
2217 			/*
2218 			 * test the original values to match what
2219 			 * lck_rw_done_gen is going to do to determine
2220 			 * which wakeups need to happen...
2221 			 *
2222 			 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
2223 			 */
2224 			if (prev & LCK_RW_W_WAITING) {
2225 				data &= ~(LCK_RW_W_WAITING);
2226 				if ((prev & LCK_RW_PRIV_EXCL) == 0) {
2227 					data &= ~(LCK_RW_R_WAITING);
2228 				}
2229 			} else {
2230 				data &= ~(LCK_RW_R_WAITING);
2231 			}
2232 		}
2233 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
2234 			break;
2235 		}
2236 		cpu_pause();
2237 	}
2238 	return lck_rw_done_gen(lock, prev);
2239 }
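
/*
 * [Editor's example, not in the original revision] lck_rw_done() suits
 * callers that no longer know which mode they hold, e.g. after a
 * conditional upgrade or downgrade; the return value reports what was
 * actually released.
 */
#if 0 /* illustrative sketch only */
static void
example_done_pattern(lck_rw_t *lock, bool want_write)
{
	lck_rw_lock(lock, want_write ? LCK_RW_TYPE_EXCLUSIVE : LCK_RW_TYPE_SHARED);
	/* ... work that may upgrade or downgrade the hold ... */
	(void)lck_rw_done(lock);
}
#endif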
2240 
2241 /*!
2242  * @function lck_rw_unlock_shared
2243  *
2244  * @abstract
2245  * Unlocks a rw_lock previously locked in shared mode.
2246  *
2247  * @discussion
2248  * The same thread that locked the lock needs to unlock it.
2249  *
2250  * @param lck           rw_lock held in shared mode to unlock.
2251  */
2252 void
2253 lck_rw_unlock_shared(
2254 	lck_rw_t        *lck)
2255 {
2256 	lck_rw_type_t   ret;
2257 
2258 	assertf(lck->lck_rw_owner == 0,
2259 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2260 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2261 	assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
2262 	ret = lck_rw_done(lck);
2263 
2264 	if (ret != LCK_RW_TYPE_SHARED) {
2265 		panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
2266 	}
2267 }
2268 
2269 /*!
2270  * @function lck_rw_unlock_exclusive
2271  *
2272  * @abstract
2273  * Unlocks a rw_lock previously locked in exclusive mode.
2274  *
2275  * @discussion
2276  * The same thread that locked the lock needs to unlock it.
2277  *
2278  * @param lck           rw_lock held in exclusive mode to unlock.
2279  */
2280 void
2281 lck_rw_unlock_exclusive(
2282 	lck_rw_t        *lck)
2283 {
2284 	lck_rw_type_t   ret;
2285 
2286 	assertf(lck->lck_rw_owner == current_thread()->ctid,
2287 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2288 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2289 	ret = lck_rw_done(lck);
2290 
2291 	if (ret != LCK_RW_TYPE_EXCLUSIVE) {
2292 		panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
2293 	}
2294 }
2295 
2296 /*!
2297  * @function lck_rw_unlock
2298  *
2299  * @abstract
2300  * Unlocks a rw_lock previously locked with lck_rw_type.
2301  *
2302  * @discussion
2303  * The lock must be unlocked by the same thread it was locked from.
2304  * The lock and unlock types have to match, unless an upgrade/downgrade was performed while
2305  * holding the lock.
2306  *
2307  * @param lck           rw_lock to unlock.
2308  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2309  */
2310 void
2311 lck_rw_unlock(
2312 	lck_rw_t         *lck,
2313 	lck_rw_type_t    lck_rw_type)
2314 {
2315 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2316 		lck_rw_unlock_shared(lck);
2317 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2318 		lck_rw_unlock_exclusive(lck);
2319 	} else {
2320 		panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
2321 	}
2322 }
2323 
2324 /*!
2325  * @function lck_rw_assert
2326  *
2327  * @abstract
2328  * Asserts the rw_lock is held.
2329  *
2330  * @discussion
2331  * read-write locks do not have a concept of ownership when held in shared mode,
2332  * so this function merely asserts that someone is holding the lock, not necessarily the caller.
2333  * However if rw_lock_debug is on, a best effort mechanism to track the owners is in place, and
2334  * this function can be more accurate.
2335  * Type can be LCK_RW_ASSERT_SHARED, LCK_RW_ASSERT_EXCLUSIVE, LCK_RW_ASSERT_HELD
2336  * LCK_RW_ASSERT_NOTHELD.
2337  *
2338  * @param lck   rw_lock to check.
2339  * @param type  assert type
2340  */
2341 void
2342 lck_rw_assert(
2343 	lck_rw_t        *lck,
2344 	unsigned int    type)
2345 {
2346 	thread_t thread = current_thread();
2347 
2348 	switch (type) {
2349 	case LCK_RW_ASSERT_SHARED:
2350 		if ((lck->lck_rw_shared_count != 0) &&
2351 		    (lck->lck_rw_owner == 0)) {
2352 #if DEBUG_RW
2353 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2354 #endif /* DEBUG_RW */
2355 			return;
2356 		}
2357 		break;
2358 	case LCK_RW_ASSERT_EXCLUSIVE:
2359 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2360 		    (lck->lck_rw_shared_count == 0) &&
2361 		    (lck->lck_rw_owner == thread->ctid)) {
2362 #if DEBUG_RW
2363 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2364 #endif /* DEBUG_RW */
2365 			return;
2366 		}
2367 		break;
2368 	case LCK_RW_ASSERT_HELD:
2369 		if (lck->lck_rw_shared_count != 0) {
2370 #if DEBUG_RW
2371 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2372 #endif /* DEBUG_RW */
2373 			return;         // Held shared
2374 		}
2375 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2376 		    (lck->lck_rw_owner == thread->ctid)) {
2377 #if DEBUG_RW
2378 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2379 #endif /* DEBUG_RW */
2380 			return;         // Held exclusive
2381 		}
2382 		break;
2383 	case LCK_RW_ASSERT_NOTHELD:
2384 		if ((lck->lck_rw_shared_count == 0) &&
2385 		    !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2386 		    (lck->lck_rw_owner == 0)) {
2387 #ifdef DEBUG_RW
2388 			assert_canlock_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2389 #endif /* DEBUG_RW */
2390 			return;
2391 		}
2392 		break;
2393 	default:
2394 		break;
2395 	}
2396 	panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2397 }
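
/*
 * [Editor's example, not in the original revision] Guarding a helper
 * that is only safe under the exclusive hold; misuse panics loudly
 * instead of silently corrupting state.
 */
#if 0 /* illustrative sketch only */
static void
example_requires_exclusive(lck_rw_t *lock)
{
	lck_rw_assert(lock, LCK_RW_ASSERT_EXCLUSIVE);
	/* ... mutation that relies on the exclusive hold ... */
}
#endif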
2398 
2399 /*!
2400  * @function kdp_lck_rw_lock_is_acquired_exclusive
2401  *
2402  * @abstract
2403  * Checks if a rw_lock is held exclusively.
2404  *
2405  * @discussion
2406  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2407  *
2408  * @param lck   lock to check
2409  *
2410  * @returns TRUE if the lock is held exclusively
2411  */
2412 boolean_t
2413 kdp_lck_rw_lock_is_acquired_exclusive(
2414 	lck_rw_t        *lck)
2415 {
2416 	if (not_in_kdp) {
2417 		panic("rw lock exclusive check done outside of kernel debugger");
2418 	}
2419 	return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2420 }
2421 
2422 void
2423 kdp_rwlck_find_owner(
2424 	__unused struct waitq   *waitq,
2425 	event64_t               event,
2426 	thread_waitinfo_t       *waitinfo)
2427 {
2428 	lck_rw_t        *rwlck = NULL;
2429 	switch (waitinfo->wait_type) {
2430 	case kThreadWaitKernelRWLockRead:
2431 		rwlck = READ_EVENT_TO_RWLOCK(event);
2432 		break;
2433 	case kThreadWaitKernelRWLockWrite:
2434 	case kThreadWaitKernelRWLockUpgrade:
2435 		rwlck = WRITE_EVENT_TO_RWLOCK(event);
2436 		break;
2437 	default:
2438 		panic("%s was called with an invalid blocking type", __FUNCTION__);
2439 		break;
2440 	}
2441 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2442 	waitinfo->owner = thread_tid(ctid_get_thread(rwlck->lck_rw_owner));
2443 }
2444 
2445 /*!
2446  * @function lck_rw_lock_yield_shared
2447  *
2448  * @abstract
2449  * Yields a rw_lock held in shared mode.
2450  *
2451  * @discussion
2452  * This function can block.
2453  * Yields the lock in case there are writers waiting.
2454  * The yield will unlock, block, and re-lock the lock in shared mode.
2455  *
2456  * @param lck           rw_lock already held in shared mode to yield.
2457  * @param force_yield   if set to true it will always yield irrespective of the lock status
2458  *
2459  * @returns TRUE if the lock was yielded, FALSE otherwise
2460  */
2461 bool
2462 lck_rw_lock_yield_shared(
2463 	lck_rw_t        *lck,
2464 	boolean_t       force_yield)
2465 {
2466 	lck_rw_word_t   word;
2467 
2468 	lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2469 
2470 	word.data = ordered_load_rw(lck);
2471 	if (word.want_excl || word.want_upgrade || force_yield) {
2472 		lck_rw_unlock_shared(lck);
2473 		mutex_pause(2);
2474 		lck_rw_lock_shared(lck);
2475 		return true;
2476 	}
2477 
2478 	return false;
2479 }
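
/*
 * [Editor's example, not in the original revision] A long read-side scan
 * that periodically offers the lock to waiting writers. After a yield
 * the lock was dropped and re-taken, so any cursors into the protected
 * structure must be re-validated.
 */
#if 0 /* illustrative sketch only */
static void
example_long_scan(lck_rw_t *lock, int batches)
{
	lck_rw_lock_shared(lock);
	for (int i = 0; i < batches; i++) {
		/* ... process one batch of items ... */
		if (lck_rw_lock_yield_shared(lock, FALSE)) {
			/* the lock was dropped and re-acquired: re-validate */
		}
	}
	lck_rw_unlock_shared(lock);
}
#endif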
2480 
2481 /*!
2482  * @function lck_rw_lock_yield_exclusive
2483  *
2484  * @abstract
2485  * Yields a rw_lock held in exclusive mode.
2486  *
2487  * @discussion
2488  * This function can block.
2489  * Yields the lock in case there are waiters, as selected by mode.
2490  * The yield will unlock, block, and re-lock the lock in exclusive mode.
2491  *
2492  * @param lck           rw_lock already held in exclusive mode to yield.
2493  * @param mode          when to yield.
2494  *
2495  * @returns TRUE if the lock was yielded, FALSE otherwise
2496  */
2497 bool
2498 lck_rw_lock_yield_exclusive(
2499 	lck_rw_t        *lck,
2500 	lck_rw_yield_t  mode)
2501 {
2502 	lck_rw_word_t word;
2503 	bool yield = false;
2504 
2505 	lck_rw_assert(lck, LCK_RW_ASSERT_EXCLUSIVE);
2506 
2507 	if (mode == LCK_RW_YIELD_ALWAYS) {
2508 		yield = true;
2509 	} else {
2510 		word.data = ordered_load_rw(lck);
2511 		if (word.w_waiting) {
2512 			yield = true;
2513 		} else if (mode == LCK_RW_YIELD_ANY_WAITER) {
2514 			yield = (word.r_waiting != 0);
2515 		}
2516 	}
2517 
2518 	if (yield) {
2519 		lck_rw_unlock_exclusive(lck);
2520 		mutex_pause(2);
2521 		lck_rw_lock_exclusive(lck);
2522 	}
2523 
2524 	return yield;
2525 }
2526 
2527 /*!
2528  * @function lck_rw_sleep
2529  *
2530  * @abstract
2531  * Assert_wait on an event while holding the rw_lock.
2532  *
2533  * @discussion
2534  * The flags decide how to re-acquire the lock upon wake up
2535  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2536  * and if the priority needs to be kept boosted until the lock is
2537  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2538  *
2539  * @param lck                   rw_lock to use to synch the assert_wait.
2540  * @param lck_sleep_action      flags.
2541  * @param event                 event to assert_wait on.
2542  * @param interruptible         wait type.
2543  */
2544 wait_result_t
2545 lck_rw_sleep(
2546 	lck_rw_t                *lck,
2547 	lck_sleep_action_t      lck_sleep_action,
2548 	event_t                 event,
2549 	wait_interrupt_t        interruptible)
2550 {
2551 	wait_result_t           res;
2552 	lck_rw_type_t           lck_rw_type;
2553 	thread_pri_floor_t      token;
2554 
2555 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2556 		panic("Invalid lock sleep action %x", lck_sleep_action);
2557 	}
2558 
2559 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2560 		/*
2561 		 * Although we are dropping the RW lock, the intent in most cases
2562 		 * is that this thread remains as an observer, since it may hold
2563 		 * some secondary resource, but must yield to avoid deadlock. In
2564 		 * this situation, make sure that the thread is boosted to the
2565 		 * ceiling while blocked, so that it can re-acquire the
2566 		 * RW lock at that priority.
2567 		 */
2568 		token = thread_priority_floor_start();
2569 	}
2570 
2571 	res = assert_wait(event, interruptible);
2572 	if (res == THREAD_WAITING) {
2573 		lck_rw_type = lck_rw_done(lck);
2574 		res = thread_block(THREAD_CONTINUE_NULL);
2575 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2576 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2577 				lck_rw_lock(lck, lck_rw_type);
2578 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2579 				lck_rw_lock_exclusive(lck);
2580 			} else {
2581 				lck_rw_lock_shared(lck);
2582 			}
2583 		}
2584 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2585 		(void)lck_rw_done(lck);
2586 	}
2587 
2588 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2589 		thread_priority_floor_end(&token);
2590 	}
2591 
2592 	return res;
2593 }
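
/*
 * [Editor's example, not in the original revision] The classic
 * condition-wait loop built on lck_rw_sleep(): the lock is dropped for
 * the block and re-acquired (here in exclusive mode) before the
 * condition is re-checked. 'example_cond' is a hypothetical flag; the
 * waker would set it and thread_wakeup() the same event address.
 */
#if 0 /* illustrative sketch only */
static void
example_wait_for_condition(lck_rw_t *lock, volatile bool *example_cond)
{
	lck_rw_lock_exclusive(lock);
	while (!*example_cond) {
		(void)lck_rw_sleep(lock, LCK_SLEEP_EXCLUSIVE,
		    (event_t)example_cond, THREAD_UNINT);
	}
	/* the condition holds and the lock is held exclusive here */
	lck_rw_unlock_exclusive(lock);
}
#endif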
2594 
2595 /*!
2596  * @function lck_rw_sleep_deadline
2597  *
2598  * @abstract
2599  * Assert_wait_deadline on an event while holding the rw_lock.
2600  *
2601  * @discussion
2602  * the flags can decide how to re-acquire the lock upon wake up
2603  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2604  * and if the priority needs to be kept boosted until the lock is
2605  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2606  *
2607  * @param lck                   rw_lock to use to synch the assert_wait.
2608  * @param lck_sleep_action      flags.
2609  * @param event                 event to assert_wait on.
2610  * @param interruptible         wait type.
2611  * @param deadline              maximum time after which the thread will be woken up
2612  */
2613 wait_result_t
2614 lck_rw_sleep_deadline(
2615 	lck_rw_t                *lck,
2616 	lck_sleep_action_t      lck_sleep_action,
2617 	event_t                 event,
2618 	wait_interrupt_t        interruptible,
2619 	uint64_t                deadline)
2620 {
2621 	wait_result_t           res;
2622 	lck_rw_type_t           lck_rw_type;
2623 	thread_pri_floor_t      token;
2624 
2625 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2626 		panic("Invalid lock sleep action %x", lck_sleep_action);
2627 	}
2628 
2629 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2630 		token = thread_priority_floor_start();
2631 	}
2632 
2633 	res = assert_wait_deadline(event, interruptible, deadline);
2634 	if (res == THREAD_WAITING) {
2635 		lck_rw_type = lck_rw_done(lck);
2636 		res = thread_block(THREAD_CONTINUE_NULL);
2637 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2638 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2639 				lck_rw_lock(lck, lck_rw_type);
2640 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2641 				lck_rw_lock_exclusive(lck);
2642 			} else {
2643 				lck_rw_lock_shared(lck);
2644 			}
2645 		}
2646 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2647 		(void)lck_rw_done(lck);
2648 	}
2649 
2650 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2651 		thread_priority_floor_end(&token);
2652 	}
2653 
2654 	return res;
2655 }
2656 
2657 /*
2658  * Reader-writer lock promotion
2659  *
2660  * We support a limited form of reader-writer
2661  * lock promotion whose effects are:
2662  *
2663  *   * Qualifying threads have decay disabled
2664  *   * Scheduler priority is reset to a floor of
2665  *     of their statically assigned priority
2666  *     or MINPRI_RWLOCK
2667  *
2668  * The rationale is that lck_rw_ts do not have
2669  * a single owner, so we cannot apply a directed
2670  * priority boost from all waiting threads
2671  * to all holding threads without maintaining
2672  * lists of all shared owners and all waiting
2673  * threads for every lock.
2674  *
2675  * Instead (and to preserve the uncontended fast-
2676  * path), acquiring (or attempting to acquire)
2677  * a RW lock in shared or exclusive lock increments
2678  * a per-thread counter. Only if that thread stops
2679  * making forward progress (for instance blocking
2680  * on a mutex, or being preempted) do we consult
2681  * the counter and apply the priority floor.
2682  * When the thread becomes runnable again (or in
2683  * the case of preemption it never stopped being
2684  * runnable), it has the priority boost and should
2685  * be in a good position to run on the CPU and
2686  * release all RW locks (at which point the priority
2687  * boost is cleared).
2688  *
2689  * Care must be taken to ensure that priority
2690  * boosts are not retained indefinitely, since unlike
2691  * mutex priority boosts (where the boost is tied
2692  * to the mutex lifecycle), the boost is tied
2693  * to the thread and independent of any particular
2694  * lck_rw_t. Assertions are in place on return
2695  * to userspace so that the boost is not held
2696  * indefinitely.
2697  *
2698  * The routines that increment/decrement the
2699  * per-thread counter should err on the side of
2700  * incrementing any time a preemption is possible
2701  * and the lock would be visible to the rest of the
2702  * system as held (so it should be incremented before
2703  * interlocks are dropped/preemption is enabled, or
2704  * before a CAS is executed to acquire the lock).
2705  *
2706  */
2707 
2708 /*!
2709  * @function lck_rw_clear_promotion
2710  *
2711  * @abstract
2712  * Undo priority promotions when the last rw_lock
2713  * is released by a thread (if a promotion was active).
2714  *
2715  * @param thread        thread to demote.
2716  * @param lock          object reason for the demotion.
2717  */
2718 __attribute__((noinline))
2719 static void
2720 lck_rw_clear_promotion(thread_t thread, const void *lock)
2721 {
2722 	/* Cancel any promotions if the thread had actually blocked while holding a RW lock */
2723 	spl_t s = splsched();
2724 	thread_lock(thread);
2725 
2726 	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
2727 		sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED,
2728 		    unslide_for_kdebug(lock));
2729 	}
2730 
2731 	thread_unlock(thread);
2732 	splx(s);
2733 }
2734 
2735 /*!
2736  * @function lck_rw_set_promotion_locked
2737  *
2738  * @abstract
2739  * Callout from context switch if the thread goes
2740  * off core with a positive rwlock_count.
2741  *
2742  * @discussion
2743  * Called at splsched with the thread locked.
2744  *
2745  * @param thread        thread to promote.
2746  */
2747 __attribute__((always_inline))
2748 void
2749 lck_rw_set_promotion_locked(thread_t thread)
2750 {
2751 	if (LcksOpts & disLkRWPrio) {
2752 		return;
2753 	}
2754 
2755 	assert(thread->rwlock_count > 0);
2756 
2757 	if (!(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2758 		sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, 0);
2759 	}
2760 }
2761 
2762 __attribute__((always_inline))
2763 void
2764 lck_rw_lock_count_inc(thread_t thread, const void *lock __unused)
2765 {
2766 	if (thread->rwlock_count++ == 0) {
2767 #if MACH_ASSERT
2768 		/*
2769 		 * Set the ast to check that the
2770 		 * rwlock_count is going to be set to zero when
2771 		 * going back to userspace.
2772 		 * Set it only once when we increment it for the first time.
2773 		 */
2774 		act_set_debug_assert();
2775 #endif
2776 	}
2777 }
2778 
2779 __abortlike
2780 static void
2781 __lck_rw_lock_count_dec_panic(thread_t thread)
2782 {
2783 	panic("rw lock count underflow for thread %p", thread);
2784 }
2785 
2786 __attribute__((always_inline))
2787 void
2788 lck_rw_lock_count_dec(thread_t thread, const void *lock)
2789 {
2790 	uint32_t rwlock_count = thread->rwlock_count--;
2791 
2792 	if (rwlock_count == 0) {
2793 		__lck_rw_lock_count_dec_panic(thread);
2794 	}
2795 
2796 	if (__probable(rwlock_count == 1)) {
2797 		/* sched_flags checked without lock, but will be rechecked while clearing */
2798 		if (__improbable(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2799 			lck_rw_clear_promotion(thread, lock);
2800 		}
2801 	}
2802 }
2803