xref: /xnu-8792.61.2/osfmk/kern/lock_rw.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 #define LOCK_PRIVATE 1
57 #include <debug.h>
58 #include <kern/locks_internal.h>
59 #include <kern/lock_stat.h>
60 #include <kern/locks.h>
61 #include <kern/zalloc.h>
62 #include <kern/thread.h>
63 #include <kern/processor.h>
64 #include <kern/sched_prim.h>
65 #include <kern/debug.h>
66 #include <machine/atomic.h>
67 #include <machine/machine_cpu.h>
68 
/* Typed kalloc zone for lck_rw_t allocations (see lck_rw_alloc_init/lck_rw_free). */
KALLOC_TYPE_DEFINE(KT_LCK_RW, lck_rw_t, KT_PRIV_ACCT);

/*
 * Wait events for writers and readers are synthesized from the lock address:
 * +1 for writers, +2 for readers. The *_TO_RWLOCK macros invert the mapping
 * so a wakeup handler can recover the lock from the event it was given.
 */
#define LCK_RW_WRITER_EVENT(lck)                (event_t)((uintptr_t)(lck)+1)
#define LCK_RW_READER_EVENT(lck)                (event_t)((uintptr_t)(lck)+2)
#define WRITE_EVENT_TO_RWLOCK(event)            ((lck_rw_t *)((uintptr_t)(event)-1))
#define READ_EVENT_TO_RWLOCK(event)             ((lck_rw_t *)((uintptr_t)(event)-2))

#if CONFIG_DTRACE
/* Flag values reported to the DTrace lockstat provider. */
#define DTRACE_RW_SHARED        0x0     //reader
#define DTRACE_RW_EXCL          0x1     //writer
#define DTRACE_NO_FLAG          0x0     //not applicable
#endif  /* CONFIG_DTRACE */

/* KERNEL_DEBUG (kdebug) trace codes for rw-lock operations. */
#define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
#define LCK_RW_LCK_SHARED_CODE          0x102
#define LCK_RW_LCK_SH_TO_EX_CODE        0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
#define LCK_RW_LCK_EX_TO_SH_CODE        0x105

#if __x86_64__
/* Additional x86-only trace codes for the spin/wait phases. */
#define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
#endif

/* Interlock bit manipulation: the interlock is a bit inside lck_rw_tag. */
#define lck_rw_ilk_lock(lock)   hw_lock_bit  ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
#define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)

/* Compiler-ordered (not hardware-ordered) accessors for the lock words. */
#define ordered_load_rw(lock)                   os_atomic_load(&(lock)->lck_rw_data, compiler_acq_rel)
#define ordered_store_rw(lock, value)           os_atomic_store(&(lock)->lck_rw_data, (value), compiler_acq_rel)
#define ordered_store_rw_owner(lock, value)     os_atomic_store(&(lock)->lck_rw_owner, (value), compiler_acq_rel)

#ifdef DEBUG_RW
/* Boot-arg gate for asserting on recursive shared acquires (rdar://74048094). */
static TUNABLE(bool, lck_rw_recursive_shared_assert_74048094, "lck_rw_recursive_shared_assert", false);
/* Packing parameters used to compress saved caller PCs in debug entries. */
SECURITY_READ_ONLY_EARLY(vm_packing_params_t) rwlde_caller_packing_params =
    VM_PACKING_PARAMS(LCK_RW_CALLER_PACKED);
/* True when rw-lock debugging was disabled via LcksOpts. */
#define rw_lock_debug_disabled()                ((LcksOpts & disLkRWDebug) == disLkRWDebug)

/* Store/retrieve the packed caller PC of a debug entry. */
#define set_rwlde_caller_packed(entry, caller)          ((entry)->rwlde_caller_packed = VM_PACK_POINTER((vm_offset_t)caller, LCK_RW_CALLER_PACKED))
#define get_rwlde_caller(entry)                         ((void*)VM_UNPACK_POINTER(entry->rwlde_caller_packed, LCK_RW_CALLER_PACKED))

#endif /* DEBUG_RW */
117 
118 /*!
119  * @function lck_rw_alloc_init
120  *
121  * @abstract
122  * Allocates and initializes a rw_lock_t.
123  *
124  * @discussion
125  * The function can block. See lck_rw_init() for initialization details.
126  *
127  * @param grp           lock group to associate with the lock.
128  * @param attr          lock attribute to initialize the lock.
129  *
130  * @returns             NULL or the allocated lock
131  */
132 lck_rw_t *
lck_rw_alloc_init(lck_grp_t * grp,lck_attr_t * attr)133 lck_rw_alloc_init(
134 	lck_grp_t       *grp,
135 	lck_attr_t      *attr)
136 {
137 	lck_rw_t *lck;
138 
139 	lck = zalloc_flags(KT_LCK_RW, Z_WAITOK | Z_ZERO);
140 	lck_rw_init(lck, grp, attr);
141 	return lck;
142 }
143 
/*!
 * @function lck_rw_init
 *
 * @abstract
 * Initializes a rw_lock_t.
 *
 * @discussion
 * Usage statistics for the lock are going to be added to the lock group provided.
 *
 * The lock attribute can be used to specify the lock contention behaviour.
 * RW_WRITER_PRIORITY is the default behaviour (LCK_ATTR_NULL defaults to RW_WRITER_PRIORITY)
 * and lck_attr_rw_shared_priority() can be used to set the behaviour to RW_SHARED_PRIORITY.
 *
 * RW_WRITER_PRIORITY gives priority to the writers upon contention with the readers;
 * if the lock is held and a writer starts waiting for the lock, readers will not be able
 * to acquire the lock until all writers stop contending. Readers could
 * potentially starve.
 * RW_SHARED_PRIORITY gives priority to the readers upon contention with the writers:
 * unless the lock is held in exclusive mode, readers will always be able to acquire the lock.
 * Readers can lock a shared lock even if there are writers waiting. Writers could potentially
 * starve.
 *
 * @param lck           lock to initialize.
 * @param grp           lock group to associate with the lock.
 * @param attr          lock attribute to initialize the lock.
 *
 */
void
lck_rw_init(
	lck_rw_t        *lck,
	lck_grp_t       *grp,
	lck_attr_t      *attr)
{
	/* keep this so that the lck_type_t type is referenced for lldb */
	lck_type_t type = LCK_TYPE_RW;

	if (attr == LCK_ATTR_NULL) {
		attr = &lck_attr_default;
	}
	/* Whole-struct assignment resets any stale state in the lock. */
	*lck = (lck_rw_t){
		.lck_rw_type = type,
		.lck_rw_can_sleep = true,
		/* priv_excl (writer priority) unless shared-priority was requested */
		.lck_rw_priv_excl = !(attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY),
	};
	lck_grp_reference(grp, &grp->lck_grp_rwcnt);
}
190 
/*!
 * @function lck_rw_free
 *
 * @abstract
 * Frees a rw_lock previously allocated with lck_rw_alloc_init().
 *
 * @discussion
 * The lock must be not held by any thread.
 *
 * @param lck           rw_lock to free.
 * @param grp           lock group the lock was initialized with.
 */
void
lck_rw_free(
	lck_rw_t        *lck,
	lck_grp_t       *grp)
{
	/* destroy first (panics on misuse), then return memory to the zone */
	lck_rw_destroy(lck, grp);
	zfree(KT_LCK_RW, lck);
}
210 
/*!
 * @function lck_rw_destroy
 *
 * @abstract
 * Destroys a rw_lock previously initialized with lck_rw_init().
 *
 * @discussion
 * The lock must be not held by any thread.
 * Panics on double-destroy or on a lock that was never a valid rw lock.
 *
 * @param lck           rw_lock to destroy.
 * @param grp           lock group the lock was initialized with.
 */
void
lck_rw_destroy(
	lck_rw_t        *lck,
	lck_grp_t       *grp)
{
	if (lck->lck_rw_type != LCK_TYPE_RW ||
	    lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
		panic("Destroying previously destroyed lock %p", lck);
	}
	/* must not be held in either shared or exclusive mode */
	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);

	/* poison the type/tag so later use or re-destroy is detectable */
	lck->lck_rw_type = LCK_TYPE_NONE;
	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
	lck_grp_deallocate(grp, &grp->lck_grp_rwcnt);
}
237 
238 #ifdef DEBUG_RW
239 
240 /*
241  * Best effort mechanism to debug rw_locks.
242  *
243  * This mechanism is in addition to the owner checks. The owner is set
244  * only when the lock is held in exclusive mode so the checks do not cover
245  * the cases in which the lock is held in shared mode.
246  *
247  * This mechanism tentatively stores the rw_lock acquired and its debug
248  * information on the thread struct.
249  * Just up to LCK_RW_EXPECTED_MAX_NUMBER rw lock debug information can be stored.
250  *
251  * NOTE: LCK_RW_EXPECTED_MAX_NUMBER is the expected number of rw_locks held
252  * at the same time. If a thread holds more than this number of rw_locks we
253  * will start losing debug information.
254  * Increasing LCK_RW_EXPECTED_MAX_NUMBER will increase the probability we will
255  * store the debug information but it will require more memory per thread
256  * and longer lock/unlock time.
257  *
258  * If an empty slot is found for the debug information, we record the lock
259  * otherwise we set the overflow threshold flag.
260  *
261  * If we reached the overflow threshold we might stop asserting because we cannot be sure
262  * anymore if the lock was acquired or not.
263  *
264  * Even if we reached the overflow threshold, we try to store the debug information
265  * for the new locks acquired. This can be useful in core dumps to debug
266  * possible return to userspace without unlocking and to find possible readers
267  * holding the lock.
268  */
/*
 * Early-boot hook: honor the KF_RW_LOCK_DEBUG_OVRD feature override by
 * disabling rw-lock debugging in LcksOpts.
 */
__startup_func
static void
rw_lock_init(void)
{
	if (kern_feature_override(KF_RW_LOCK_DEBUG_OVRD)) {
		LcksOpts |= disLkRWDebug;
	}
}
STARTUP(LOCKS, STARTUP_RANK_FIRST, rw_lock_init);
278 
279 static inline struct rw_lock_debug_entry *
find_lock_in_savedlocks(lck_rw_t * lock,rw_lock_debug_t * rw_locks_held)280 find_lock_in_savedlocks(lck_rw_t* lock, rw_lock_debug_t *rw_locks_held)
281 {
282 	int i;
283 	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
284 		struct rw_lock_debug_entry *existing = &rw_locks_held->rwld_locks[i];
285 		if (existing->rwlde_lock == lock) {
286 			return existing;
287 		}
288 	}
289 
290 	return NULL;
291 }
292 
/* Panic helper: the caller expected a free debug slot but none exists. */
__abortlike
static void
rwlock_slot_panic(rw_lock_debug_t *rw_locks_held)
{
	panic("No empty slot found in %p slot_used %d", rw_locks_held, rw_locks_held->rwld_locks_saved);
}
299 
/*
 * Return the first unused entry (rwlde_lock == NULL) in the debug array.
 * Callers must have checked for room; panics if the array is full.
 */
static inline struct rw_lock_debug_entry *
find_empty_slot(rw_lock_debug_t *rw_locks_held)
{
	int i;
	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
		struct rw_lock_debug_entry *entry = &rw_locks_held->rwld_locks[i];
		if (entry->rwlde_lock == NULL) {
			return entry;
		}
	}
	rwlock_slot_panic(rw_locks_held);
}
312 
/* Panic helper: thread is trying to re-acquire a lock it already holds. */
__abortlike
static void
canlock_rwlock_panic(lck_rw_t* lock, thread_t thread, struct rw_lock_debug_entry *entry)
{
	panic("RW lock %p already held by %p caller %p mode_count %d state 0x%x owner 0x%p ",
	    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
}
321 
/*
 * Best-effort check that @thread may acquire @lock with mode @type.
 * Panics if the thread's debug records show the lock already held,
 * except for the (tolerated) recursive shared-acquire case.
 */
static inline void
assert_canlock_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;

	if (__probable(rw_lock_debug_disabled() || (rw_locks_held->rwld_locks_acquired == 0))) {
		//no locks saved, safe to lock
		return;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__improbable(entry != NULL)) {
		boolean_t can_be_shared_recursive;
		if (lck_rw_recursive_shared_assert_74048094) {
			/* strict mode: recursion only tolerated on shared-priority locks */
			can_be_shared_recursive = (lock->lck_rw_priv_excl == 0);
		} else {
			/* currently rw_lock_shared is called recursively,
			 * until the code is fixed allow to lock
			 * recursively in shared mode
			 */
			can_be_shared_recursive = TRUE;
		}
		/* positive mode_count means the entry tracks shared holds */
		if ((type == LCK_RW_TYPE_SHARED) && can_be_shared_recursive && entry->rwlde_mode_count >= 1) {
			return;
		}
		canlock_rwlock_panic(lock, thread, entry);
	}
}
350 
/* Panic helper: expected @thread to hold @lock but no record exists. */
__abortlike
static void
held_rwlock_notheld_panic(lck_rw_t* lock, thread_t thread)
{
	panic("RW lock %p not held by %p", lock, thread);
}
357 
/*
 * Panic helper: the lock is recorded on the thread, but not in the
 * mode (@type) the caller asserted. Reports the saved debug info.
 */
__abortlike
static void
held_rwlock_notheld_with_info_panic(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, struct rw_lock_debug_entry *entry)
{
	if (type == LCK_RW_TYPE_EXCLUSIVE) {
		panic("RW lock %p not held in exclusive by %p caller %p read %d state 0x%x owner 0x%p ",
		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
	} else {
		panic("RW lock %p not held in shared by %p caller %p read %d state 0x%x owner 0x%p ",
		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
	}
}
372 
/*
 * Best-effort check that @thread holds @lock in mode @type.
 * If the debug array overflowed, information may have been lost, so
 * missing records are only fatal when no overflow occurred.
 */
static inline void
assert_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;

	if (__probable(rw_lock_debug_disabled())) {
		return;
	}

	if (__improbable(rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_locks_saved == 0)) {
		/* nothing recorded: fatal unless we lost records to overflow */
		if (rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_overflow == 0) {
			held_rwlock_notheld_panic(lock, thread);
		}
		return;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__probable(entry != NULL)) {
		/* mode_count == -1 encodes exclusive; > 0 counts shared holds */
		if (type == LCK_RW_TYPE_EXCLUSIVE && entry->rwlde_mode_count != -1) {
			held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
		} else {
			if (type == LCK_RW_TYPE_SHARED && entry->rwlde_mode_count <= 0) {
				held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
			}
		}
	} else {
		if (rw_locks_held->rwld_overflow == 0) {
			held_rwlock_notheld_panic(lock, thread);
		}
	}
}
404 
/*
 * Record an upgrade (shared -> exclusive) or downgrade (exclusive -> shared)
 * of @lock on @thread's debug records. @typeFrom is the mode being LEFT.
 * If the entry was lost to overflow, try to re-record it in the new mode.
 */
static inline void
change_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t typeFrom, void* caller)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;

	if (__probable(rw_lock_debug_disabled())) {
		return;
	}

	if (__improbable(rw_locks_held->rwld_locks_saved == 0)) {
		if (rw_locks_held->rwld_overflow == 0) {
			held_rwlock_notheld_panic(lock, thread);
		}
		return;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__probable(entry != NULL)) {
		if (typeFrom == LCK_RW_TYPE_SHARED) {
			//We are upgrading
			/* an upgrade is only legal from a single shared hold */
			assertf(entry->rwlde_mode_count == 1,
			    "RW lock %p not held by a single shared when upgrading "
			    "by %p caller %p read %d state 0x%x owner 0x%p ",
			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
			entry->rwlde_mode_count = -1;
			set_rwlde_caller_packed(entry, caller);
		} else {
			//We are downgrading
			assertf(entry->rwlde_mode_count == -1,
			    "RW lock %p not held in write mode when downgrading "
			    "by %p caller %p read %d state 0x%x owner 0x%p ",
			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
			entry->rwlde_mode_count = 1;
			set_rwlde_caller_packed(entry, caller);
		}
		return;
	}

	/* no entry found: fatal unless records were lost to overflow */
	if (rw_locks_held->rwld_overflow == 0) {
		held_rwlock_notheld_panic(lock, thread);
	}

	if (rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER) {
		//array is full
		return;
	}

	/* re-record in the destination mode (opposite of typeFrom) */
	struct rw_lock_debug_entry *null_entry = find_empty_slot(rw_locks_held);
	null_entry->rwlde_lock = lock;
	set_rwlde_caller_packed(null_entry, caller);
	if (typeFrom == LCK_RW_TYPE_SHARED) {
		null_entry->rwlde_mode_count = -1;
	} else {
		null_entry->rwlde_mode_count = 1;
	}
	rw_locks_held->rwld_locks_saved++;
}
464 
/* Panic helper: rwld_locks_acquired would overflow UINT32_MAX. */
__abortlike
static void
add_held_rwlock_too_many_panic(thread_t thread)
{
	panic("RW lock too many rw locks held, rwld_locks_acquired maxed out for thread %p", thread);
}
471 
/*
 * Record the acquisition of @lock in mode @type on @thread's debug state.
 * Always bumps the acquired counter; saves per-lock detail only while a
 * free slot exists, otherwise sets rwld_overflow (detail is lost).
 */
static inline void
add_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, void* caller)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
	struct rw_lock_debug_entry *null_entry;

	if (__probable(rw_lock_debug_disabled())) {
		return;
	}

	if (__improbable(rw_locks_held->rwld_locks_acquired == UINT32_MAX)) {
		add_held_rwlock_too_many_panic(thread);
	}
	rw_locks_held->rwld_locks_acquired++;

	if (type == LCK_RW_TYPE_EXCLUSIVE) {
		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
			//array is full
			rw_locks_held->rwld_overflow = 1;
			return;
		}
		null_entry = find_empty_slot(rw_locks_held);
		null_entry->rwlde_lock = lock;
		set_rwlde_caller_packed(null_entry, caller);
		/* -1 encodes an exclusive hold */
		null_entry->rwlde_mode_count = -1;
		rw_locks_held->rwld_locks_saved++;
		return;
	} else {
		if (__probable(rw_locks_held->rwld_locks_saved == 0)) {
			//array is empty
			goto add_shared;
		}

		boolean_t allow_shared_recursive;
		if (lck_rw_recursive_shared_assert_74048094) {
			/* strict mode: recursion only tolerated on shared-priority locks */
			allow_shared_recursive = (lock->lck_rw_priv_excl == 0);
		} else {
			allow_shared_recursive = TRUE;
		}
		if (allow_shared_recursive) {
			//It could be already locked in shared mode
			struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
			if (entry != NULL) {
				assert(entry->rwlde_mode_count > 0);
				/* mode_count is int8; guard against recursion overflow */
				assertf(entry->rwlde_mode_count != INT8_MAX,
				    "RW lock %p with too many recursive shared held "
				    "from %p caller %p read %d state 0x%x owner 0x%p",
				    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
				    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
				entry->rwlde_mode_count += 1;
				return;
			}
		}

		//none of the locks were a match
		//try to add a new entry
		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
			//array is full
			rw_locks_held->rwld_overflow = 1;
			return;
		}

add_shared:
		null_entry = find_empty_slot(rw_locks_held);
		null_entry->rwlde_lock = lock;
		set_rwlde_caller_packed(null_entry, caller);
		null_entry->rwlde_mode_count = 1;
		rw_locks_held->rwld_locks_saved++;
	}
}
542 
/*
 * Record the release of @lock (mode @type) on @thread's debug state.
 * Decrements counters; frees the entry once its last shared hold (or the
 * exclusive hold) is dropped. Clears the overflow flag when all locks
 * are released, since the records are consistent again at that point.
 */
static inline void
remove_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
{
	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;

	if (__probable(rw_lock_debug_disabled())) {
		return;
	}

	if (__improbable(rw_locks_held->rwld_locks_acquired == 0)) {
		/* nothing tracked; tolerate the extra unlock silently */
		return;
	}
	rw_locks_held->rwld_locks_acquired--;

	if (rw_locks_held->rwld_locks_saved == 0) {
		/* acquired > 0 with no detail is only legal after overflow */
		assert(rw_locks_held->rwld_overflow == 1);
		goto out;
	}

	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
	if (__probable(entry != NULL)) {
		if (type == LCK_RW_TYPE_EXCLUSIVE) {
			assert(entry->rwlde_mode_count == -1);
			entry->rwlde_mode_count = 0;
		} else {
			assert(entry->rwlde_mode_count > 0);
			entry->rwlde_mode_count--;
			if (entry->rwlde_mode_count > 0) {
				/* more recursive shared holds remain; keep the slot */
				goto out;
			}
		}
		entry->rwlde_caller_packed = 0;
		entry->rwlde_lock = NULL;
		rw_locks_held->rwld_locks_saved--;
	} else {
		assert(rw_locks_held->rwld_overflow == 1);
	}

out:
	if (rw_locks_held->rwld_locks_acquired == 0) {
		rw_locks_held->rwld_overflow = 0;
	}
	return;
}
587 #endif /* DEBUG_RW */
588 
/*
 * We disable interrupts while holding the RW interlock to prevent an
 * interrupt from exacerbating hold time.
 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 */

/*
 * Disable interrupts and take the lock's interlock bit.
 * Returns the previous interrupt state, to be passed back to
 * lck_interlock_unlock().
 */
static inline boolean_t
lck_interlock_lock(
	lck_rw_t        *lck)
{
	boolean_t       istate;

	istate = ml_set_interrupts_enabled(FALSE);
	lck_rw_ilk_lock(lck);
	return istate;
}
604 
/* Release the interlock bit and restore the saved interrupt state. */
static inline void
lck_interlock_unlock(
	lck_rw_t        *lck,
	boolean_t       istate)
{
	lck_rw_ilk_unlock(lck);
	ml_set_interrupts_enabled(istate);
}
613 
/*
 * Bump the thread's count of held rw locks; on the 0 -> 1 transition
 * (MACH_ASSERT builds) arm the AST that verifies the count returns to
 * zero before the thread goes back to userspace.
 */
static inline void
lck_rw_inc_thread_count(
	thread_t thread)
{
	__assert_only uint32_t prev_rwlock_count;

	prev_rwlock_count = thread->rwlock_count++;
#if MACH_ASSERT
	/*
	 * Set the ast to check that the
	 * rwlock_count is going to be set to zero when
	 * going back to userspace.
	 * Set it only once when we increment it for the first time.
	 */
	if (prev_rwlock_count == 0) {
		act_set_debug_assert();
	}
#endif
}
633 
/*
 * compute the deadline to spin against when
 * waiting for a change of state on a lck_rw_t
 */
static inline uint64_t
lck_rw_deadline_for_spin(
	lck_rw_t        *lck)
{
	lck_rw_word_t   word;

	word.data = ordered_load_rw(lck);
	if (word.can_sleep) {
		if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
			/*
			 * there are already threads waiting on this lock... this
			 * implies that they have spun beyond their deadlines waiting for
			 * the desired state to show up so we will not bother spinning at this time...
			 *   or
			 * the current number of threads sharing this lock exceeds our capacity to run them
			 * concurrently and since all states we're going to spin for require the rw_shared_count
			 * to be at 0, we'll not bother spinning since the latency for this to happen is
			 * unpredictable...
			 */
			return mach_absolute_time();
		}
		/* normal case: spin for the tunable MutexSpin interval */
		return mach_absolute_time() + os_atomic_load(&MutexSpin, relaxed);
	} else {
		/* spin-only lock: effectively spin forever (~100000 seconds) */
		return mach_absolute_time() + (100000LL * 1000000000LL);
	}
}
664 
/*
 * This inline is used when busy-waiting for an rw lock.
 * If interrupts were disabled when the lock primitive was called,
 * we poll the IPI handler for pending tlb flushes in x86.
 */
static inline void
lck_rw_lock_pause(
	boolean_t       interrupts_enabled)
{
/*
 * NOTE(review): this guard is spelled X86_64 while the rest of the file
 * tests __x86_64__ — confirm X86_64 is defined by the build on Intel,
 * otherwise this branch is dead and the wait_for_event() path is taken.
 */
#if X86_64
	if (!interrupts_enabled) {
		handle_pending_TLB_flushes();
	}
	cpu_pause();
#else
	(void) interrupts_enabled;
	wait_for_event();
#endif
}
684 
/* Outcomes of lck_rw_drain_status() while waiting for bits to clear. */
typedef enum __enum_closed {
	LCK_RW_DRAIN_S_DRAINED       = 0,  /* status_mask bits are clear */
	LCK_RW_DRAIN_S_NOT_DRAINED   = 1,  /* bits still set; caller asked not to wait */
	LCK_RW_DRAIN_S_EARLY_RETURN  = 2,  /* lock_pause block requested bail-out */
	LCK_RW_DRAIN_S_TIMED_OUT     = 3,  /* spin deadline expired */
} lck_rw_drain_state_t;
691 
/*
 * Spin until all bits in @status_mask clear in the lock word, or bail.
 *
 * @param lock        lock to watch.
 * @param status_mask bits that must drain to zero.
 * @param wait        if FALSE, sample once and return immediately.
 * @param lock_pause  optional block polled each iteration; returning true
 *                    aborts the wait with LCK_RW_DRAIN_S_EARLY_RETURN.
 */
static lck_rw_drain_state_t
lck_rw_drain_status(
	lck_rw_t        *lock,
	uint32_t        status_mask,
	boolean_t       wait,
	bool            (^lock_pause)(void))
{
	uint64_t        deadline = 0;
	uint32_t        data;
	boolean_t       istate = FALSE;

	if (wait) {
		deadline = lck_rw_deadline_for_spin(lock);
#if __x86_64__
		/* remembered so lck_rw_lock_pause can poll TLB-flush IPIs */
		istate = ml_get_interrupts_enabled();
#endif
	}

	for (;;) {
#if __x86_64__
		data = os_atomic_load(&lock->lck_rw_data, relaxed);
#else
		/* load-exclusive so wait_for_event() can park until a store */
		data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
#endif
		if ((data & status_mask) == 0) {
			atomic_exchange_abort();
			return LCK_RW_DRAIN_S_DRAINED;
		}

		if (!wait) {
			atomic_exchange_abort();
			return LCK_RW_DRAIN_S_NOT_DRAINED;
		}

		lck_rw_lock_pause(istate);

		if (mach_absolute_time() >= deadline) {
			return LCK_RW_DRAIN_S_TIMED_OUT;
		}

		if (lock_pause && lock_pause()) {
			return LCK_RW_DRAIN_S_EARLY_RETURN;
		}
	}
}
737 
/*
 * Spin while interlock is held.
 */
static inline void
lck_rw_interlock_spin(
	lck_rw_t        *lock)
{
	uint32_t        data, prev;

	for (;;) {
		/* exclusive-begin so non-x86 can park in wait_for_event() */
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_relaxed);
		if (data & LCK_RW_INTERLOCK) {
#if __x86_64__
			cpu_pause();
#else
			wait_for_event();
#endif
		} else {
			atomic_exchange_abort();
			return;
		}
	}
}
761 
/*
 * NOTE(review): these two defines appear unused in the visible portion of
 * this file (lck_rw_grab takes lck_rw_grab_flags_t) — possibly legacy.
 */
#define LCK_RW_GRAB_WANT        0
#define LCK_RW_GRAB_SHARED      1

/* Flags controlling what lck_rw_grab() attempts and whether it spins. */
typedef enum __enum_closed __enum_options {
	LCK_RW_GRAB_F_SHARED    = 0x0,  // Not really a flag obviously but makes call sites more readable.
	LCK_RW_GRAB_F_WANT_EXCL = 0x1,
	LCK_RW_GRAB_F_WAIT      = 0x2,
} lck_rw_grab_flags_t;

/* Outcomes of lck_rw_grab(). */
typedef enum __enum_closed {
	LCK_RW_GRAB_S_NOT_LOCKED    = 0,  /* not acquired; caller asked not to wait */
	LCK_RW_GRAB_S_LOCKED        = 1,  /* acquired */
	LCK_RW_GRAB_S_EARLY_RETURN  = 2,  /* lock_pause block requested bail-out */
	LCK_RW_GRAB_S_TIMED_OUT     = 3,  /* spin deadline expired */
} lck_rw_grab_state_t;
777 
/*
 * Attempt to acquire the lock word: either set LCK_RW_WANT_EXCL
 * (LCK_RW_GRAB_F_WANT_EXCL) or add a shared reader. With
 * LCK_RW_GRAB_F_WAIT, spins until acquired, deadline expiry, or the
 * optional @lock_pause block returns true.
 */
static lck_rw_grab_state_t
lck_rw_grab(
	lck_rw_t            *lock,
	lck_rw_grab_flags_t flags,
	bool                (^lock_pause)(void))
{
	uint64_t        deadline = 0;
	uint32_t        data, prev;
	boolean_t       do_exch, istate = FALSE;

	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);

	if ((flags & LCK_RW_GRAB_F_WAIT) != 0) {
		deadline = lck_rw_deadline_for_spin(lock);
#if __x86_64__
		/* remembered so lck_rw_lock_pause can poll TLB-flush IPIs */
		istate = ml_get_interrupts_enabled();
#endif
	}

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
		if (data & LCK_RW_INTERLOCK) {
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		do_exch = FALSE;
		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
			if ((data & LCK_RW_WANT_EXCL) == 0) {
				data |= LCK_RW_WANT_EXCL;
				do_exch = TRUE;
			}
		} else {        // LCK_RW_GRAB_SHARED
			/*
			 * A reader may enter if no writer wants the lock, or if
			 * readers already hold it and the lock is not
			 * writer-priority (PRIV_EXCL clear).
			 */
			if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
			    (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
				data += LCK_RW_SHARED_READER;
				do_exch = TRUE;
			}
		}
		if (do_exch) {
			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
				return LCK_RW_GRAB_S_LOCKED;
			}
			/* lost the race; retry the whole sequence */
		} else {
			if ((flags & LCK_RW_GRAB_F_WAIT) == 0) {
				atomic_exchange_abort();
				return LCK_RW_GRAB_S_NOT_LOCKED;
			}

			lck_rw_lock_pause(istate);

			if (mach_absolute_time() >= deadline) {
				return LCK_RW_GRAB_S_TIMED_OUT;
			}
			if (lock_pause && lock_pause()) {
				return LCK_RW_GRAB_S_EARLY_RETURN;
			}
		}
	}
}
838 
/*
 * The inverse of lck_rw_grab - drops either the LCK_RW_WANT_EXCL bit or
 * decrements the reader count. Doesn't deal with waking up waiters - i.e.
 * should only be called when can_sleep is false.
 */
static void
lck_rw_drop(lck_rw_t *lock, lck_rw_grab_flags_t flags)
{
	uint32_t data, prev;

	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);
	assert(!lock->lck_rw_can_sleep);

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);

		/* Interlock should never be taken when can_sleep is false. */
		assert3u(data & LCK_RW_INTERLOCK, ==, 0);

		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
			data &= ~LCK_RW_WANT_EXCL;
		} else {
			data -= LCK_RW_SHARED_READER;
		}

		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
			break;
		}

		/* CAS lost a race with another CPU; back off briefly and retry */
		cpu_pause();
	}

	return;
}
873 
/*
 *	Routine:	lck_rw_lock_exclusive_gen
 *	Function:
 *		Slow path for taking the lock in exclusive mode:
 *		first win the lck_rw_want_excl bit, then wait for the
 *		remaining readers and any upgrade owner to drain.
 *		Returns TRUE once the lock is held; returns FALSE only
 *		when a lock_pause block was supplied and requested an
 *		early return.
 */
static boolean_t
lck_rw_lock_exclusive_gen(
	lck_rw_t        *lock,
	bool            (^lock_pause)(void))
{
	__assert_only thread_t self = current_thread();
	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
	lck_rw_word_t           word;
	int                     slept = 0;      /* times we called thread_block(), for lockstat */
	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
	lck_rw_drain_state_t    drain_state = LCK_RW_DRAIN_S_NOT_DRAINED;
	wait_result_t           res = 0;
	boolean_t               istate;

#if     CONFIG_DTRACE
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
#endif

	assertf(lock->lck_rw_owner != self->ctid,
	    "Lock already held state=0x%x, owner=%p",
	    ordered_load_rw(lock), self);

#ifdef DEBUG_RW
	/*
	 * Best effort attempt to check that this thread
	 * is not already holding the lock (this checks read mode too).
	 */
	assert_canlock_rwlock(lock, self, LCK_RW_TYPE_EXCLUSIVE);
#endif /* DEBUG_RW */

	/*
	 *	Try to acquire the lck_rw_want_excl bit.
	 */
	while (lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL, NULL) != LCK_RW_GRAB_S_LOCKED) {
#if     CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lock->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, 0, 0, 0, 0);

		/* Spin for the want_excl bit, honoring lock_pause if supplied. */
		grab_state = lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT, lock_pause);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, 0, 0, grab_state, 0);

		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		word.data = ordered_load_rw(lock);
		if (word.can_sleep) {
			istate = lck_interlock_lock(lock);
			word.data = ordered_load_rw(lock);

			if (word.want_excl) {
				/* Still owned: mark a writer waiter and sleep. */
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				word.w_waiting = 1;
				ordered_store_rw(lock, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lock, istate);
				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				/* want_excl is free: claim it under the interlock. */
				word.want_excl = 1;
				ordered_store_rw(lock, word.data);
				lck_interlock_unlock(lock, istate);
				break;
			}
		}
	}

	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
		/* lock_pause fired before we owned want_excl: nothing to undo. */
		assert(lock_pause);
		return FALSE;
	}

	/*
	 * Wait for readers (and upgrades) to finish...
	 */
	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
#if     CONFIG_DTRACE
		/*
		 * Either sleeping or spinning is happening, start
		 * a timing of our delay interval now.  If we set it
		 * to -1 we don't have accurate data so we cannot later
		 * decide to record a dtrace spin or sleep event.
		 */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lock->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

		/* Spin waiting for readers/upgrade owner to drain. */
		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE, lock_pause);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, drain_state, 0);

		if (drain_state == LCK_RW_DRAIN_S_DRAINED ||
		    drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		word.data = ordered_load_rw(lock);
		if (word.can_sleep) {
			istate = lck_interlock_lock(lock);
			word.data = ordered_load_rw(lock);

			if (word.shared_count != 0 || word.want_upgrade) {
				/* Still readers/upgrader present: sleep as a writer waiter. */
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				word.w_waiting = 1;
				ordered_store_rw(lock, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lock, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				lck_interlock_unlock(lock, istate);
				/*
				 * must own the lock now, since we checked for
				 * readers or upgrade owner behind the interlock
				 * no need for a call to 'lck_rw_drain_status'
				 */
				break;
			}
		}
	}

#if     CONFIG_DTRACE
	/*
	 * Decide what latencies we suffered that are Dtrace events.
	 * If we have set wait_interval, then we either spun or slept.
	 * At least we get out from under the interlock before we record
	 * which is the best we can do here to minimize the impact
	 * of the tracing.
	 * If we have set wait_interval to -1, then dtrace was not enabled when we
	 * started sleeping/spinning so we don't record this event.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
			    mach_absolute_time() - wait_interval, 1);
		} else {
			/*
			 * For the blocking case, we also record if when we blocked
			 * it was held for read or write, and how many readers.
			 * Notice that above we recorded this before we dropped
			 * the interlock so the count is accurate.
			 */
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
#endif /* CONFIG_DTRACE */

	if (drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
		/*
		 * lock_pause fired while draining: give back the want_excl
		 * bit we already own before bailing out.
		 */
		lck_rw_drop(lock, LCK_RW_GRAB_F_WANT_EXCL);
		assert(lock_pause);
		return FALSE;
	}

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
#endif  /* CONFIG_DTRACE */

	return TRUE;
}
1094 
/*
 * Fast-path exclusive acquire: atomically set LCK_RW_WANT_EXCL iff the
 * lock word has no readers, no exclusive/upgrade intent, and the
 * interlock is not held.  Evaluates to TRUE on success.
 */
#define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \
	    (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
	    LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
1098 /*!
1099  * @function lck_rw_lock_exclusive_check_contended
1100  *
1101  * @abstract
1102  * Locks a rw_lock in exclusive mode.
1103  *
1104  * @discussion
1105  * This routine IS EXPERIMENTAL.
1106  * It's only used for the vm object lock, and use for other subsystems is UNSUPPORTED.
1107  * Note that the return value is ONLY A HEURISTIC w.r.t. the lock's contention.
1108  *
1109  * @param lock           rw_lock to lock.
1110  *
1111  * @returns Returns TRUE if the thread spun or blocked while attempting to acquire the lock, FALSE
1112  *          otherwise.
1113  */
1114 bool
lck_rw_lock_exclusive_check_contended(lck_rw_t * lock)1115 lck_rw_lock_exclusive_check_contended(
1116 	lck_rw_t        *lock)
1117 {
1118 	thread_t        thread = current_thread();
1119 	bool            contended  = false;
1120 
1121 	if (lock->lck_rw_can_sleep) {
1122 		lck_rw_inc_thread_count(thread);
1123 	} else if (get_preemption_level() == 0) {
1124 		panic("Taking non-sleepable RW lock with preemption enabled");
1125 	}
1126 
1127 	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1128 #if     CONFIG_DTRACE
1129 		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1130 #endif  /* CONFIG_DTRACE */
1131 	} else {
1132 		contended = true;
1133 		(void) lck_rw_lock_exclusive_gen(lock, NULL);
1134 	}
1135 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1136 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1137 	ordered_store_rw_owner(lock, thread->ctid);
1138 
1139 #ifdef DEBUG_RW
1140 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, __builtin_return_address(0));
1141 #endif /* DEBUG_RW */
1142 	return contended;
1143 }
1144 
/*
 * Common exclusive-lock body shared by lck_rw_lock_exclusive() and
 * lck_rw_lock_exclusive_b(): fast-path TAS with fallback to the
 * generic slow path.  Returns FALSE only on a lock_pause early return.
 */
__attribute__((always_inline))
static boolean_t
lck_rw_lock_exclusive_internal_inline(
	lck_rw_t        *lock,
	void            *caller,
	bool            (^lock_pause)(void))
{
#pragma unused(caller)
	thread_t        thread = current_thread();

	/*
	 * Sleepable locks are tracked in the thread's rwlock count;
	 * spin-mode locks may only be taken with preemption disabled.
	 */
	if (lock->lck_rw_can_sleep) {
		lck_rw_inc_thread_count(thread);
	} else if (get_preemption_level() == 0) {
		panic("Taking non-sleepable RW lock with preemption enabled");
	}

	/* Fast path: uncontended test-and-set of LCK_RW_WANT_EXCL. */
	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
#if     CONFIG_DTRACE
		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
#endif  /* CONFIG_DTRACE */
	} else if (!lck_rw_lock_exclusive_gen(lock, lock_pause)) {
		/*
		 * lck_rw_lock_exclusive_gen() should only return
		 * early if lock_pause has been passed and
		 * returns FALSE. lock_pause is exclusive with
		 * lck_rw_can_sleep().
		 */
		assert(!lock->lck_rw_can_sleep);
		return FALSE;
	}

	/* Lock is held exclusive: record ourselves as the owner. */
	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
	ordered_store_rw_owner(lock, thread->ctid);

#if DEBUG_RW
	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
#endif /* DEBUG_RW */

	return TRUE;
}
1186 
1187 __attribute__((noinline))
1188 static void
lck_rw_lock_exclusive_internal(lck_rw_t * lock,void * caller)1189 lck_rw_lock_exclusive_internal(
1190 	lck_rw_t        *lock,
1191 	void            *caller)
1192 {
1193 	(void) lck_rw_lock_exclusive_internal_inline(lock, caller, NULL);
1194 }
1195 
1196 /*!
1197  * @function lck_rw_lock_exclusive
1198  *
1199  * @abstract
1200  * Locks a rw_lock in exclusive mode.
1201  *
1202  * @discussion
1203  * This function can block.
1204  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1205  * can acquire it in exclusive mode.
1206  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1207  *
1208  * @param lock           rw_lock to lock.
1209  */
1210 void
lck_rw_lock_exclusive(lck_rw_t * lock)1211 lck_rw_lock_exclusive(
1212 	lck_rw_t        *lock)
1213 {
1214 	(void) lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), NULL);
1215 }
1216 
1217 /*!
1218  * @function lck_rw_lock_exclusive_b
1219  *
1220  * @abstract
1221  * Locks a rw_lock in exclusive mode. Returns early if the lock can't be acquired
1222  * and the specified block returns true.
1223  *
1224  * @discussion
1225  * Identical to lck_rw_lock_exclusive() but can return early if the lock can't be
1226  * acquired and the specified block returns true. The block is called
1227  * repeatedly when waiting to acquire the lock.
1228  * Should only be called when the lock cannot sleep (i.e. when
1229  * lock->lck_rw_can_sleep is false).
1230  *
1231  * @param lock           rw_lock to lock.
1232  * @param lock_pause     block invoked while waiting to acquire lock
1233  *
1234  * @returns              Returns TRUE if the lock is successfully taken,
1235  *                       FALSE if the block returns true and the lock has
1236  *                       not been acquired.
1237  */
1238 boolean_t
1239 lck_rw_lock_exclusive_b(
1240 	lck_rw_t        *lock,
1241 	bool            (^lock_pause)(void))
1242 {
1243 	assert(!lock->lck_rw_can_sleep);
1244 
1245 	return lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), lock_pause);
1246 }
1247 
1248 /*
1249  *	Routine:	lck_rw_lock_shared_gen
1250  *	Function:
1251  *		Fast path code has determined that this lock
1252  *		is held exclusively... this is where we spin/block
1253  *		until we can acquire the lock in the shared mode
1254  */
static boolean_t
lck_rw_lock_shared_gen(
	lck_rw_t        *lck,
	bool            (^lock_pause)(void))
{
	__assert_only thread_t  self = current_thread();
	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
	lck_rw_word_t           word;
	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
	int                     slept = 0;      /* times we called thread_block(), for lockstat */
	wait_result_t           res = 0;
	boolean_t               istate;

#if     CONFIG_DTRACE
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
#endif /* CONFIG_DTRACE */

	assertf(lck->lck_rw_owner != self->ctid,
	    "Lock already held state=0x%x, owner=%p",
	    ordered_load_rw(lck), self);

#ifdef DEBUG_RW
	/*
	 * Best effort attempt to check that this thread
	 * is not already holding the lock in shared mode.
	 */
	assert_canlock_rwlock(lck, self, LCK_RW_TYPE_SHARED);
#endif

	/* Try once without waiting, then spin (and possibly sleep) below. */
	while (lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED, NULL) != LCK_RW_GRAB_S_LOCKED) {
#if     CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);

		/* Spin for a read count, honoring lock_pause if supplied. */
		grab_state = lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED | LCK_RW_GRAB_F_WAIT, lock_pause);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, grab_state, 0);

		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
			break;
		}

		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock for read
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			word.data = ordered_load_rw(lck);
			/*
			 * Sleep only while a writer/upgrader wants the lock and
			 * either there are no readers or writers have priority
			 * (priv_excl); otherwise take a read count right here.
			 */
			if ((word.want_excl || word.want_upgrade) &&
			    ((word.shared_count == 0) || word.priv_excl)) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
				    trace_lck, word.want_excl, word.want_upgrade, 0, 0);

				word.r_waiting = 1;
				ordered_store_rw(lck, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
				res = assert_wait(LCK_RW_READER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
				    trace_lck, res, slept, 0, 0);
			} else {
				/* Take our read count under the interlock and exit. */
				word.shared_count++;
				ordered_store_rw(lck, word.data);
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}

#if     CONFIG_DTRACE
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 0,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
#endif /* CONFIG_DTRACE */

	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
		/* lock_pause asked us to bail; the lock is not held. */
		assert(lock_pause);
		return FALSE;
	}

#if     CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
#endif  /* CONFIG_DTRACE */

	return TRUE;
}
1378 
/*
 * Common shared-lock body: fast-path CAS that bumps the reader count,
 * falling back to lck_rw_lock_shared_gen() when a writer owns or wants
 * the lock.  Returns FALSE only on a lock_pause early return.
 */
__attribute__((always_inline))
static boolean_t
lck_rw_lock_shared_internal_inline(
	lck_rw_t        *lock,
	void            *caller,
	bool            (^lock_pause)(void))
{
#pragma unused(caller)

	uint32_t        data, prev;
	thread_t        thread = current_thread();
#ifdef DEBUG_RW
	boolean_t       check_canlock = TRUE;
#endif

	/*
	 * Sleepable locks are tracked in the thread's rwlock count;
	 * spin-mode locks may only be taken with preemption disabled.
	 */
	if (lock->lck_rw_can_sleep) {
		lck_rw_inc_thread_count(thread);
	} else if (get_preemption_level() == 0) {
		panic("Taking non-sleepable RW lock with preemption enabled");
	}

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
			/* Writer/upgrader present (or interlock held): slow path. */
			atomic_exchange_abort();
			if (!lck_rw_lock_shared_gen(lock, lock_pause)) {
				/*
				 * lck_rw_lock_shared_gen() should only return
				 * early if lock_pause has been passed and
				 * returns FALSE. lock_pause is exclusive with
				 * lck_rw_can_sleep().
				 */
				assert(!lock->lck_rw_can_sleep);
				return FALSE;
			}

			goto locked;
		}
#ifdef DEBUG_RW
		if ((data & LCK_RW_SHARED_MASK) == 0) {
			/*
			 * If the lock is uncontended,
			 * we do not need to check if we can lock it
			 */
			check_canlock = FALSE;
		}
#endif
		data += LCK_RW_SHARED_READER;
		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
			break;
		}
		/* CAS lost a race: back off briefly and retry. */
		cpu_pause();
	}
#ifdef DEBUG_RW
	if (check_canlock) {
		/*
		 * Best effort attempt to check that this thread
		 * is not already holding the lock (this checks read mode too).
		 */
		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
	}
#endif
locked:
	/* Shared holders never set the owner field. */
	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));

#if     CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
#endif  /* CONFIG_DTRACE */

#ifdef DEBUG_RW
	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
#endif /* DEBUG_RW */

	return TRUE;
}
1455 
1456 __attribute__((noinline))
1457 static void
lck_rw_lock_shared_internal(lck_rw_t * lock,void * caller)1458 lck_rw_lock_shared_internal(
1459 	lck_rw_t        *lock,
1460 	void            *caller)
1461 {
1462 	(void) lck_rw_lock_shared_internal_inline(lock, caller, NULL);
1463 }
1464 
1465 /*!
1466  * @function lck_rw_lock_shared
1467  *
1468  * @abstract
1469  * Locks a rw_lock in shared mode.
1470  *
1471  * @discussion
1472  * This function can block.
1473  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1474  * can acquire it in exclusive mode.
1475  * If the lock is held in shared mode and there are no writers waiting, a reader will be able to acquire
1476  * the lock without waiting.
1477  * If the lock is held in shared mode and there is at least a writer waiting, a reader will wait
1478  * for all the writers to make progress if the lock was initialized with the default settings. Instead if
1479  * RW_SHARED_PRIORITY was selected at initialization time, a reader will never wait if the lock is held
1480  * in shared mode.
1481  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1482  *
1483  * @param lock           rw_lock to lock.
1484  */
1485 void
lck_rw_lock_shared(lck_rw_t * lock)1486 lck_rw_lock_shared(
1487 	lck_rw_t        *lock)
1488 {
1489 	(void) lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), NULL);
1490 }
1491 
1492 /*!
1493  * @function lck_rw_lock_shared_b
1494  *
1495  * @abstract
1496  * Locks a rw_lock in shared mode. Returns early if the lock can't be acquired
1497  * and the specified block returns true.
1498  *
1499  * @discussion
1500  * Identical to lck_rw_lock_shared() but can return early if the lock can't be
1501  * acquired and the specified block returns true. The block is called
1502  * repeatedly when waiting to acquire the lock.
1503  * Should only be called when the lock cannot sleep (i.e. when
1504  * lock->lck_rw_can_sleep is false).
1505  *
1506  * @param lock           rw_lock to lock.
1507  * @param lock_pause     block invoked while waiting to acquire lock
1508  *
1509  * @returns              Returns TRUE if the lock is successfully taken,
1510  *                       FALSE if the block returns true and the lock has
1511  *                       not been acquired.
1512  */
1513 boolean_t
1514 lck_rw_lock_shared_b(
1515 	lck_rw_t        *lock,
1516 	bool            (^lock_pause)(void))
1517 {
1518 	assert(!lock->lck_rw_can_sleep);
1519 
1520 	return lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), lock_pause);
1521 }
1522 
1523 /*
1524  *	Routine:	lck_rw_lock_shared_to_exclusive_failure
1525  *	Function:
1526  *		Fast path code has already dropped our read
1527  *		count and determined that someone else owns 'lck_rw_want_upgrade'
1528  *		if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1529  *		all we need to do here is determine if a wakeup is needed
1530  */
1531 static boolean_t
lck_rw_lock_shared_to_exclusive_failure(lck_rw_t * lck,uint32_t prior_lock_state)1532 lck_rw_lock_shared_to_exclusive_failure(
1533 	lck_rw_t        *lck,
1534 	uint32_t        prior_lock_state)
1535 {
1536 	thread_t        thread = current_thread();
1537 	uint32_t        rwlock_count;
1538 
1539 	if ((prior_lock_state & LCK_RW_W_WAITING) &&
1540 	    ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1541 		/*
1542 		 *	Someone else has requested upgrade.
1543 		 *	Since we've released the read lock, wake
1544 		 *	him up if he's blocked waiting
1545 		 */
1546 		thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1547 	}
1548 
1549 	/* Check if dropping the lock means that we need to unpromote */
1550 	if (lck->lck_rw_can_sleep) {
1551 		rwlock_count = thread->rwlock_count--;
1552 	} else {
1553 		rwlock_count = UINT32_MAX;
1554 	}
1555 
1556 	if (rwlock_count == 0) {
1557 		panic("rw lock count underflow for thread %p", thread);
1558 	}
1559 
1560 	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1561 		/* sched_flags checked without lock, but will be rechecked while clearing */
1562 		lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1563 	}
1564 
1565 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1566 	    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1567 
1568 #ifdef DEBUG_RW
1569 	remove_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
1570 #endif /* DEBUG_RW */
1571 
1572 	return FALSE;
1573 }
1574 
1575 /*
1576  *	Routine:	lck_rw_lock_shared_to_exclusive_success
1577  *	Function:
1578  *		the fast path code has already dropped our read
1579  *		count and successfully acquired 'lck_rw_want_upgrade'
1580  *		we just need to wait for the rest of the readers to drain
1581  *		and then we can return as the exclusive holder of this lock
1582  */
static void
lck_rw_lock_shared_to_exclusive_success(
	lck_rw_t        *lock)
{
	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
	int                     slept = 0;      /* times we called thread_block(), for lockstat */
	lck_rw_word_t           word;
	wait_result_t           res;
	boolean_t               istate;
	lck_rw_drain_state_t    drain_state;

#if     CONFIG_DTRACE
	uint64_t                wait_interval = 0;
	int                     readers_at_sleep = 0;
	boolean_t               dtrace_ls_initialized = FALSE;
	boolean_t               dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
#endif

	/* Spin (and possibly sleep) until all remaining readers drain. */
	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
		word.data = ordered_load_rw(lock);
#if     CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 *  start a timing of our delay interval now.
				 */
				readers_at_sleep = word.shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, word.shared_count, 0, 0, 0);

		/* Spin waiting for the reader count to reach zero. */
		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE, NULL);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, lock->lck_rw_shared_count, 0, 0, 0);

		if (drain_state == LCK_RW_DRAIN_S_DRAINED) {
			break;
		}

		/*
		 * if we get here, the spin deadline in lck_rw_wait_on_status()
		 * has expired w/o the rw_shared_count having drained to 0
		 * check to see if we're allowed to do a thread_block
		 */
		if (word.can_sleep) {
			istate = lck_interlock_lock(lock);

			word.data = ordered_load_rw(lock);
			if (word.shared_count != 0) {
				/* Readers still present: sleep as a writer waiter. */
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
				    trace_lck, word.shared_count, 0, 0, 0);

				word.w_waiting = 1;
				ordered_store_rw(lock, word.data);

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lock, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
				    trace_lck, res, slept, 0, 0);
			} else {
				/* Readers drained while we held the interlock; done. */
				lck_interlock_unlock(lock, istate);
				break;
			}
		}
	}
#if     CONFIG_DTRACE
	/*
	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
#endif
}
1681 
1682 /*!
1683  * @function lck_rw_lock_shared_to_exclusive
1684  *
1685  * @abstract
1686  * Upgrades a rw_lock held in shared mode to exclusive.
1687  *
1688  * @discussion
1689  * This function can block.
1690  * Only one reader at a time can upgrade to exclusive mode. If the upgrades fails the function will
1691  * return with the lock not held.
1692  * The caller needs to hold the lock in shared mode to upgrade it.
1693  *
1694  * @param lock           rw_lock already held in shared mode to upgrade.
1695  *
1696  * @returns TRUE if the lock was upgraded, FALSE if it was not possible.
1697  *          If the function was not able to upgrade the lock, the lock will be dropped
1698  *          by the function.
1699  */
boolean_t
lck_rw_lock_shared_to_exclusive(
	lck_rw_t        *lock)
{
	thread_t thread = current_thread();
	uint32_t data, prev;

	/* Upgrades require a writer-priority (priv_excl) lock. */
	assertf(lock->lck_rw_priv_excl != 0, "lock %p thread %p", lock, current_thread());

#if DEBUG_RW
	assert_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
#endif /* DEBUG_RW */

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
		if (data & LCK_RW_INTERLOCK) {
			/* Interlock busy: wait for it to clear, then retry. */
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		if (data & LCK_RW_WANT_UPGRADE) {
			/*
			 * Another reader already owns the upgrade: shed our
			 * read count and report failure (lock dropped).
			 */
			data -= LCK_RW_SHARED_READER;
			if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
				data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
			}
			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
				return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
			}
		} else {
			data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
			data -= LCK_RW_SHARED_READER;           /* and shed our read count */
			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
				break;
			}
		}
		cpu_pause();
	}
	/* we now own the WANT_UPGRADE */
	if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
		lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
	}

	/* Lock is now exclusive: publish our ctid as the owner. */
	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));

	ordered_store_rw_owner(lock, thread->ctid);
#if     CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
#endif  /* CONFIG_DTRACE */

#if DEBUG_RW
	change_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, __builtin_return_address(0));
#endif /* DEBUG_RW */
	return TRUE;
}
1755 
1756 /*
1757  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1758  *      Function:
1759  *		Fast path has already dropped
1760  *		our exclusive state and bumped lck_rw_shared_count
1761  *		all we need to do here is determine if anyone
1762  *		needs to be awakened.
1763  */
1764 static void
lck_rw_lock_exclusive_to_shared_gen(lck_rw_t * lck,uint32_t prior_lock_state,void * caller)1765 lck_rw_lock_exclusive_to_shared_gen(
1766 	lck_rw_t        *lck,
1767 	uint32_t        prior_lock_state,
1768 	void            *caller)
1769 {
1770 #pragma unused(caller)
1771 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1772 	lck_rw_word_t   fake_lck;
1773 
1774 	/*
1775 	 * prior_lock state is a snapshot of the 1st word of the
1776 	 * lock in question... we'll fake up a pointer to it
1777 	 * and carefully not access anything beyond whats defined
1778 	 * in the first word of a lck_rw_t
1779 	 */
1780 	fake_lck.data = prior_lock_state;
1781 
1782 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1783 	    trace_lck, fake_lck->want_excl, fake_lck->want_upgrade, 0, 0);
1784 
1785 	/*
1786 	 * don't wake up anyone waiting to take the lock exclusively
1787 	 * since we hold a read count... when the read count drops to 0,
1788 	 * the writers will be woken.
1789 	 *
1790 	 * wake up any waiting readers if we don't have any writers waiting,
1791 	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1792 	 */
1793 	if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1794 		thread_wakeup(LCK_RW_READER_EVENT(lck));
1795 	}
1796 
1797 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1798 	    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1799 
1800 #if CONFIG_DTRACE
1801 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1802 #endif
1803 
1804 #if DEBUG_RW
1805 	thread_t        thread = current_thread();
1806 	change_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1807 #endif /* DEBUG_RW */
1808 }
1809 
1810 /*!
1811  * @function lck_rw_lock_exclusive_to_shared
1812  *
1813  * @abstract
1814  * Downgrades a rw_lock held in exclusive mode to shared.
1815  *
1816  * @discussion
1817  * The caller needs to hold the lock in exclusive mode to be able to downgrade it.
1818  *
1819  * @param lock           rw_lock already held in exclusive mode to downgrade.
1820  */
void
lck_rw_lock_exclusive_to_shared(
	lck_rw_t        *lock)
{
	uint32_t        data, prev;

	/* Only the exclusive holder may downgrade: verify, then clear ownership. */
	assertf(lock->lck_rw_owner == current_thread()->ctid,
	    "state=0x%x, owner=%p", lock->lck_rw_data,
	    ctid_get_thread_unsafe(lock->lck_rw_owner));
	ordered_store_rw_owner(lock, 0);

	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
		if (data & LCK_RW_INTERLOCK) {
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
			continue;
		}
		/* become the (first) reader and drop whichever exclusive-intent bit we held */
		data += LCK_RW_SHARED_READER;
		if (data & LCK_RW_WANT_UPGRADE) {
			data &= ~(LCK_RW_WANT_UPGRADE);
		} else {
			data &= ~(LCK_RW_WANT_EXCL);
		}
		/*
		 * Clear the writer-waiting bit unless writers have priority
		 * (PRIV_EXCL) and one is actually waiting; the wakeup decision
		 * itself is made from the prior state in the _gen routine below.
		 */
		if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
			data &= ~(LCK_RW_W_WAITING);
		}
		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
			break;
		}
		cpu_pause();
	}
	lck_rw_lock_exclusive_to_shared_gen(lock, prev, __builtin_return_address(0));
}
1855 
1856 /*
1857  * Very sad hack, but the codegen for lck_rw_lock
1858  * is very unhappy with the combination of __builtin_return_address()
1859  * and a noreturn function. For some reason it adds more frames
1860  * than it should. rdar://76570684
1861  */
void
_lck_rw_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wmissing-noreturn"
__attribute__((noinline, weak))
void
_lck_rw_lock_type_panic(
	lck_rw_t        *lck,
	lck_rw_type_t   lck_rw_type)
{
	/* Out-of-line so lck_rw_lock()'s codegen stays clean; see rdar://76570684 above. */
	panic("lck_rw_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
}
#pragma clang diagnostic pop
1875 
1876 /*!
1877  * @function lck_rw_lock
1878  *
1879  * @abstract
1880  * Locks a rw_lock with the specified type.
1881  *
1882  * @discussion
1883  * See lck_rw_lock_shared() or lck_rw_lock_exclusive() for more details.
1884  *
1885  * @param lck           rw_lock to lock.
1886  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1887  */
1888 void
lck_rw_lock(lck_rw_t * lck,lck_rw_type_t lck_rw_type)1889 lck_rw_lock(
1890 	lck_rw_t        *lck,
1891 	lck_rw_type_t   lck_rw_type)
1892 {
1893 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1894 		return lck_rw_lock_shared_internal(lck, __builtin_return_address(0));
1895 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1896 		return lck_rw_lock_exclusive_internal(lck, __builtin_return_address(0));
1897 	}
1898 	_lck_rw_lock_type_panic(lck, lck_rw_type);
1899 }
1900 
__attribute__((always_inline))
static boolean_t
lck_rw_try_lock_shared_internal_inline(
	lck_rw_t        *lock,
	void            *caller)
{
#pragma unused(caller)

	uint32_t        data, prev;
	thread_t        thread = current_thread();
#ifdef DEBUG_RW
	boolean_t       check_canlock = TRUE;
#endif

	/*
	 * Non-blocking shared acquire: spin only while the interlock is
	 * held; fail immediately once any writer intent is visible.
	 */
	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
		if (data & LCK_RW_INTERLOCK) {
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
			atomic_exchange_abort();
			return FALSE;             /* lock is busy */
		}
#ifdef DEBUG_RW
		if ((data & LCK_RW_SHARED_MASK) == 0) {
			/*
			 * If the lock is uncontended,
			 * we do not need to check if we can lock it
			 */
			check_canlock = FALSE;
		}
#endif
		data += LCK_RW_SHARED_READER;     /* Increment reader refcount */
		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
			break;
		}
		cpu_pause();
	}
#ifdef DEBUG_RW
	if (check_canlock) {
		/*
		 * Best effort attempt to check that this thread
		 * is not already holding the lock (this checks read mode too).
		 */
		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
	}
#endif
	/* Shared holders never record an owner ctid. */
	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));

	if (lock->lck_rw_can_sleep) {
		lck_rw_inc_thread_count(thread);
	} else if (get_preemption_level() == 0) {
		panic("Taking non-sleepable RW lock with preemption enabled");
	}

#if     CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
#endif  /* CONFIG_DTRACE */

#ifdef DEBUG_RW
	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
#endif /* DEBUG_RW */
	return TRUE;
}
1968 
__attribute__((noinline))
static boolean_t
lck_rw_try_lock_shared_internal(
	lck_rw_t        *lock,
	void            *caller)
{
	/* Out-of-line wrapper so lck_rw_try_lock() can forward its caller's PC. */
	return lck_rw_try_lock_shared_internal_inline(lock, caller);
}
1977 
/*!
 * @function lck_rw_try_lock_shared
 *
 * @abstract
 * Tries to lock a rw_lock in read mode.
 *
 * @discussion
 * This function will return and not block in case the lock is already held.
 * See lck_rw_lock_shared for more details.
 *
 * @param lock           rw_lock to lock.
 *
 * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
 */
boolean_t
lck_rw_try_lock_shared(
	lck_rw_t        *lock)
{
	/* Record the direct caller as the acquirer (used by DEBUG_RW bookkeeping). */
	return lck_rw_try_lock_shared_internal_inline(lock, __builtin_return_address(0));
}
1998 
__attribute__((always_inline))
static boolean_t
lck_rw_try_lock_exclusive_internal_inline(
	lck_rw_t        *lock,
	void            *caller)
{
#pragma unused(caller)
	uint32_t        data, prev;

	/*
	 * Non-blocking exclusive acquire: spin only while the interlock is
	 * held; fail as soon as any reader or writer intent is visible.
	 */
	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
		if (data & LCK_RW_INTERLOCK) {
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
			atomic_exchange_abort();
			return FALSE;
		}
		data |= LCK_RW_WANT_EXCL;
		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
			break;
		}
		cpu_pause();
	}
	thread_t thread = current_thread();

	if (lock->lck_rw_can_sleep) {
		lck_rw_inc_thread_count(thread);
	} else if (get_preemption_level() == 0) {
		panic("Taking non-sleepable RW lock with preemption enabled");
	}

	/* No owner should be recorded until we store ours below. */
	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));

	ordered_store_rw_owner(lock, thread->ctid);
#if     CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
#endif  /* CONFIG_DTRACE */

#ifdef DEBUG_RW
	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
#endif /* DEBUG_RW */
	return TRUE;
}
2046 
__attribute__((noinline))
static boolean_t
lck_rw_try_lock_exclusive_internal(
	lck_rw_t        *lock,
	void            *caller)
{
	/* Out-of-line wrapper so lck_rw_try_lock() can forward its caller's PC. */
	return lck_rw_try_lock_exclusive_internal_inline(lock, caller);
}
2055 
/*!
 * @function lck_rw_try_lock_exclusive
 *
 * @abstract
 * Tries to lock a rw_lock in write mode.
 *
 * @discussion
 * This function will return and not block in case the lock is already held.
 * See lck_rw_lock_exclusive for more details.
 *
 * @param lock           rw_lock to lock.
 *
 * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
 */
boolean_t
lck_rw_try_lock_exclusive(
	lck_rw_t        *lock)
{
	/* Record the direct caller as the acquirer (used by DEBUG_RW bookkeeping). */
	return lck_rw_try_lock_exclusive_internal_inline(lock, __builtin_return_address(0));
}
2076 
2077 /*
2078  * Very sad hack, but the codegen for lck_rw_try_lock
2079  * is very unhappy with the combination of __builtin_return_address()
2080  * and a noreturn function. For some reason it adds more frames
2081  * than it should. rdar://76570684
2082  */
2083 boolean_t
2084 _lck_rw_try_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
2085 #pragma clang diagnostic push
2086 #pragma clang diagnostic ignored "-Wmissing-noreturn"
2087 __attribute__((noinline, weak))
2088 boolean_t
_lck_rw_try_lock_type_panic(lck_rw_t * lck,lck_rw_type_t lck_rw_type)2089 _lck_rw_try_lock_type_panic(
2090 	lck_rw_t        *lck,
2091 	lck_rw_type_t   lck_rw_type)
2092 {
2093 	panic("lck_rw_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
2094 }
2095 #pragma clang diagnostic pop
2096 
/*!
 * @function lck_rw_try_lock
 *
 * @abstract
 * Tries to lock a rw_lock with the specified type.
 *
 * @discussion
 * This function will return and not wait/block in case the lock is already held.
 * See lck_rw_try_lock_shared() or lck_rw_try_lock_exclusive() for more details.
 *
 * @param lck           rw_lock to lock.
 * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
 *
 * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
 */
2112 boolean_t
lck_rw_try_lock(lck_rw_t * lck,lck_rw_type_t lck_rw_type)2113 lck_rw_try_lock(
2114 	lck_rw_t        *lck,
2115 	lck_rw_type_t   lck_rw_type)
2116 {
2117 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2118 		return lck_rw_try_lock_shared_internal(lck, __builtin_return_address(0));
2119 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2120 		return lck_rw_try_lock_exclusive_internal(lck, __builtin_return_address(0));
2121 	}
2122 	return _lck_rw_try_lock_type_panic(lck, lck_rw_type);
2123 }
2124 
2125 /*
2126  *      Routine:        lck_rw_done_gen
2127  *
2128  *	prior_lock_state is the value in the 1st
2129  *      word of the lock at the time of a successful
2130  *	atomic compare and exchange with the new value...
2131  *      it represents the state of the lock before we
2132  *	decremented the rw_shared_count or cleared either
2133  *      rw_want_upgrade or rw_want_write and
2134  *	the lck_x_waiting bits...  since the wrapper
2135  *      routine has already changed the state atomically,
2136  *	we just need to decide if we should
2137  *	wake up anyone and what value to return... we do
2138  *	this by examining the state of the lock before
2139  *	we changed it
2140  */
static lck_rw_type_t
lck_rw_done_gen(
	lck_rw_t        *lck,
	uint32_t        prior_lock_state)
{
	lck_rw_word_t   fake_lck;
	lck_rw_type_t   lock_type;
	thread_t        thread;
	uint32_t        rwlock_count;

	/*
	 * prior_lock state is a snapshot of the 1st word of the
	 * lock in question... we'll fake up a pointer to it
	 * and carefully not access anything beyond whats defined
	 * in the first word of a lck_rw_t
	 */
	fake_lck.data = prior_lock_state;

	/*
	 * Wakeups are only needed when the last reference went away:
	 * exclusive release (shared_count == 0) or last reader (== 1).
	 */
	if (fake_lck.shared_count <= 1) {
		if (fake_lck.w_waiting) {
			thread_wakeup(LCK_RW_WRITER_EVENT(lck));
		}

		/* wake readers unless a waiting writer has priority (priv_excl) */
		if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
			thread_wakeup(LCK_RW_READER_EVENT(lck));
		}
	}
	/* a nonzero prior reader count means we released a shared hold */
	if (fake_lck.shared_count) {
		lock_type = LCK_RW_TYPE_SHARED;
	} else {
		lock_type = LCK_RW_TYPE_EXCLUSIVE;
	}

	/* Check if dropping the lock means that we need to unpromote */
	thread = current_thread();
	if (fake_lck.can_sleep) {
		rwlock_count = thread->rwlock_count--;
	} else {
		/* non-sleepable locks don't participate in promotion counting */
		rwlock_count = UINT32_MAX;
	}

	if (rwlock_count == 0) {
		panic("rw lock count underflow for thread %p", thread);
	}

	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
		/* sched_flags checked without lock, but will be rechecked while clearing */
		lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
	}
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
#endif

#ifdef DEBUG_RW
	remove_held_rwlock(lck, thread, lock_type);
#endif /* DEBUG_RW */
	return lock_type;
}
2199 
2200 /*!
2201  * @function lck_rw_done
2202  *
2203  * @abstract
2204  * Force unlocks a rw_lock without consistency checks.
2205  *
2206  * @discussion
2207  * Do not use unless sure you can avoid consistency checks.
2208  *
2209  * @param lock           rw_lock to unlock.
2210  */
lck_rw_type_t
lck_rw_done(
	lck_rw_t        *lock)
{
	uint32_t        data, prev;
	boolean_t       once = FALSE;

#ifdef DEBUG_RW
	/*
	 * Best effort attempt to check that this thread
	 * is holding the lock.
	 */
	thread_t thread = current_thread();
	assert_held_rwlock(lock, thread, 0);
#endif /* DEBUG_RW */
	/*
	 * Infer the held mode from the lock word itself and atomically
	 * release it; the pre-release snapshot (prev) is handed to
	 * lck_rw_done_gen for wakeups and the mode return value.
	 */
	for (;;) {
		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
		if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		if (data & LCK_RW_SHARED_MASK) {        /* lock is held shared */
			assertf(lock->lck_rw_owner == 0,
			    "state=0x%x, owner=%p", lock->lck_rw_data,
			    ctid_get_thread_unsafe(lock->lck_rw_owner));
			data -= LCK_RW_SHARED_READER;
			if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
				goto check_waiters;
			}
		} else {                                        /* if reader count == 0, must be exclusive lock */
			if (data & LCK_RW_WANT_UPGRADE) {
				data &= ~(LCK_RW_WANT_UPGRADE);
			} else {
				if (data & LCK_RW_WANT_EXCL) {
					data &= ~(LCK_RW_WANT_EXCL);
				} else {                                /* lock is not 'owned', panic */
					panic("Releasing non-exclusive RW lock without a reader refcount!");
				}
			}
			if (!once) {
				// Only check for holder and clear it once
				assertf(lock->lck_rw_owner == current_thread()->ctid,
				    "state=0x%x, owner=%p", lock->lck_rw_data,
				    ctid_get_thread_unsafe(lock->lck_rw_owner));
				ordered_store_rw_owner(lock, 0);
				once = TRUE;
			}
check_waiters:
			/*
			 * test the original values to match what
			 * lck_rw_done_gen is going to do to determine
			 * which wakeups need to happen...
			 *
			 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
			 */
			if (prev & LCK_RW_W_WAITING) {
				data &= ~(LCK_RW_W_WAITING);
				if ((prev & LCK_RW_PRIV_EXCL) == 0) {
					data &= ~(LCK_RW_R_WAITING);
				}
			} else {
				data &= ~(LCK_RW_R_WAITING);
			}
		}
		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
			break;
		}
		cpu_pause();
	}
	return lck_rw_done_gen(lock, prev);
}
2283 
2284 /*!
2285  * @function lck_rw_unlock_shared
2286  *
2287  * @abstract
2288  * Unlocks a rw_lock previously locked in shared mode.
2289  *
2290  * @discussion
2291  * The same thread that locked the lock needs to unlock it.
2292  *
2293  * @param lck           rw_lock held in shared mode to unlock.
2294  */
2295 void
lck_rw_unlock_shared(lck_rw_t * lck)2296 lck_rw_unlock_shared(
2297 	lck_rw_t        *lck)
2298 {
2299 	lck_rw_type_t   ret;
2300 
2301 	assertf(lck->lck_rw_owner == 0,
2302 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2303 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2304 	assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
2305 	ret = lck_rw_done(lck);
2306 
2307 	if (ret != LCK_RW_TYPE_SHARED) {
2308 		panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
2309 	}
2310 }
2311 
2312 /*!
2313  * @function lck_rw_unlock_exclusive
2314  *
2315  * @abstract
2316  * Unlocks a rw_lock previously locked in exclusive mode.
2317  *
2318  * @discussion
2319  * The same thread that locked the lock needs to unlock it.
2320  *
2321  * @param lck           rw_lock held in exclusive mode to unlock.
2322  */
2323 void
lck_rw_unlock_exclusive(lck_rw_t * lck)2324 lck_rw_unlock_exclusive(
2325 	lck_rw_t        *lck)
2326 {
2327 	lck_rw_type_t   ret;
2328 
2329 	assertf(lck->lck_rw_owner == current_thread()->ctid,
2330 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2331 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2332 	ret = lck_rw_done(lck);
2333 
2334 	if (ret != LCK_RW_TYPE_EXCLUSIVE) {
2335 		panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
2336 	}
2337 }
2338 
2339 /*!
2340  * @function lck_rw_unlock
2341  *
2342  * @abstract
2343  * Unlocks a rw_lock previously locked with lck_rw_type.
2344  *
2345  * @discussion
2346  * The lock must be unlocked by the same thread it was locked from.
2347  * The type of the lock/unlock have to match, unless an upgrade/downgrade was performed while
2348  * holding the lock.
2349  *
2350  * @param lck           rw_lock to unlock.
2351  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2352  */
2353 void
lck_rw_unlock(lck_rw_t * lck,lck_rw_type_t lck_rw_type)2354 lck_rw_unlock(
2355 	lck_rw_t         *lck,
2356 	lck_rw_type_t    lck_rw_type)
2357 {
2358 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2359 		lck_rw_unlock_shared(lck);
2360 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2361 		lck_rw_unlock_exclusive(lck);
2362 	} else {
2363 		panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
2364 	}
2365 }
2366 
2367 /*!
2368  * @function lck_rw_assert
2369  *
2370  * @abstract
2371  * Asserts the rw_lock is held.
2372  *
2373  * @discussion
2374  * read-write locks do not have a concept of ownership when held in shared mode,
2375  * so this function merely asserts that someone is holding the lock, not necessarily the caller.
2376  * However if rw_lock_debug is on, a best effort mechanism to track the owners is in place, and
2377  * this function can be more accurate.
2378  * Type can be LCK_RW_ASSERT_SHARED, LCK_RW_ASSERT_EXCLUSIVE, LCK_RW_ASSERT_HELD
2379  * LCK_RW_ASSERT_NOTHELD.
2380  *
2381  * @param lck   rw_lock to check.
2382  * @param type  assert type
2383  */
void
lck_rw_assert(
	lck_rw_t        *lck,
	unsigned int    type)
{
	thread_t thread = current_thread();

	/* Each case returns on success; falling out of the switch panics. */
	switch (type) {
	case LCK_RW_ASSERT_SHARED:
		/* someone holds it shared and no exclusive owner is recorded */
		if ((lck->lck_rw_shared_count != 0) &&
		    (lck->lck_rw_owner == 0)) {
#if DEBUG_RW
			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
#endif /* DEBUG_RW */
			return;
		}
		break;
	case LCK_RW_ASSERT_EXCLUSIVE:
		/* exclusive intent set, no readers, and this thread is the owner */
		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
		    (lck->lck_rw_shared_count == 0) &&
		    (lck->lck_rw_owner == thread->ctid)) {
#if DEBUG_RW
			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
#endif /* DEBUG_RW */
			return;
		}
		break;
	case LCK_RW_ASSERT_HELD:
		if (lck->lck_rw_shared_count != 0) {
#if DEBUG_RW
			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
#endif /* DEBUG_RW */
			return;         // Held shared
		}
		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
		    (lck->lck_rw_owner == thread->ctid)) {
#if DEBUG_RW
			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
#endif /* DEBUG_RW */
			return;         // Held exclusive
		}
		break;
	case LCK_RW_ASSERT_NOTHELD:
		/* no readers, no writer intent, no recorded owner */
		if ((lck->lck_rw_shared_count == 0) &&
		    !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
		    (lck->lck_rw_owner == 0)) {
#ifdef DEBUG_RW
			assert_canlock_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
#endif /* DEBUG_RW */
			return;
		}
		break;
	default:
		break;
	}
	panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
}
2441 
/*!
 * @function kdp_lck_rw_lock_is_acquired_exclusive
 *
 * @abstract
 * Checks if a rw_lock is held exclusively.
 *
 * @discussion
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 *
 * @param lck   lock to check
 *
 * @returns TRUE if the lock is held exclusively
 */
2455 boolean_t
kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t * lck)2456 kdp_lck_rw_lock_is_acquired_exclusive(
2457 	lck_rw_t        *lck)
2458 {
2459 	if (not_in_kdp) {
2460 		panic("panic: rw lock exclusive check done outside of kernel debugger");
2461 	}
2462 	return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2463 }
2464 
void
kdp_rwlck_find_owner(
	__unused struct waitq   *waitq,
	event64_t               event,
	thread_waitinfo_t       *waitinfo)
{
	lck_rw_t        *rwlck = NULL;
	/* Recover the lock address from the event the thread blocked on. */
	switch (waitinfo->wait_type) {
	case kThreadWaitKernelRWLockRead:
		rwlck = READ_EVENT_TO_RWLOCK(event);
		break;
	case kThreadWaitKernelRWLockWrite:
	case kThreadWaitKernelRWLockUpgrade:
		/* writers and upgraders block on the same write event */
		rwlck = WRITE_EVENT_TO_RWLOCK(event);
		break;
	default:
		panic("%s was called with an invalid blocking type", __FUNCTION__);
		break;
	}
	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
	/* lck_rw_owner is 0 for shared holders, so only exclusive owners are named */
	waitinfo->owner = thread_tid(ctid_get_thread(rwlck->lck_rw_owner));
}
2487 
/*!
 * @function lck_rw_lock_yield_shared
 *
 * @abstract
 * Yields a rw_lock held in shared mode.
 *
 * @discussion
 * This function can block.
 * Yields the lock in case there are writers waiting.
 * The yield will unlock, block, and re-lock the lock in shared mode.
 *
 * @param lck           rw_lock already held in shared mode to yield.
 * @param force_yield   if set to true it will always yield irrespective of the lock status
 *
 * @returns TRUE if the lock was yielded, FALSE otherwise
 */
2504 bool
lck_rw_lock_yield_shared(lck_rw_t * lck,boolean_t force_yield)2505 lck_rw_lock_yield_shared(
2506 	lck_rw_t        *lck,
2507 	boolean_t       force_yield)
2508 {
2509 	lck_rw_word_t   word;
2510 
2511 	lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2512 
2513 	word.data = ordered_load_rw(lck);
2514 	if (word.want_excl || word.want_upgrade || force_yield) {
2515 		lck_rw_unlock_shared(lck);
2516 		mutex_pause(2);
2517 		lck_rw_lock_shared(lck);
2518 		return true;
2519 	}
2520 
2521 	return false;
2522 }
2523 
/*!
 * @function lck_rw_lock_yield_exclusive
 *
 * @abstract
 * Yields a rw_lock held in exclusive mode.
 *
 * @discussion
 * This function can block.
 * Yields the lock in case there are writers waiting.
 * The yield will unlock, block, and re-lock the lock in exclusive mode.
 *
 * @param lck           rw_lock already held in exclusive mode to yield.
 * @param mode          when to yield.
 *
 * @returns TRUE if the lock was yielded, FALSE otherwise
 */
2540 bool
lck_rw_lock_yield_exclusive(lck_rw_t * lck,lck_rw_yield_t mode)2541 lck_rw_lock_yield_exclusive(
2542 	lck_rw_t        *lck,
2543 	lck_rw_yield_t  mode)
2544 {
2545 	lck_rw_word_t word;
2546 	bool yield = false;
2547 
2548 	lck_rw_assert(lck, LCK_RW_ASSERT_EXCLUSIVE);
2549 
2550 	if (mode == LCK_RW_YIELD_ALWAYS) {
2551 		yield = true;
2552 	} else {
2553 		word.data = ordered_load_rw(lck);
2554 		if (word.w_waiting) {
2555 			yield = true;
2556 		} else if (mode == LCK_RW_YIELD_ANY_WAITER) {
2557 			yield = (word.r_waiting != 0);
2558 		}
2559 	}
2560 
2561 	if (yield) {
2562 		lck_rw_unlock_exclusive(lck);
2563 		mutex_pause(2);
2564 		lck_rw_lock_exclusive(lck);
2565 	}
2566 
2567 	return yield;
2568 }
2569 
2570 /*!
2571  * @function lck_rw_sleep
2572  *
2573  * @abstract
2574  * Assert_wait on an event while holding the rw_lock.
2575  *
2576  * @discussion
2577  * the flags can decide how to re-acquire the lock upon wake up
2578  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2579  * and if the priority needs to be kept boosted until the lock is
2580  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2581  *
2582  * @param lck                   rw_lock to use to synch the assert_wait.
2583  * @param lck_sleep_action      flags.
2584  * @param event                 event to assert_wait on.
2585  * @param interruptible         wait type.
2586  */
wait_result_t
lck_rw_sleep(
	lck_rw_t                *lck,
	lck_sleep_action_t      lck_sleep_action,
	event_t                 event,
	wait_interrupt_t        interruptible)
{
	wait_result_t           res;
	lck_rw_type_t           lck_rw_type;
	thread_pri_floor_t      token;

	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
		panic("Invalid lock sleep action %x", lck_sleep_action);
	}

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		/*
		 * Although we are dropping the RW lock, the intent in most cases
		 * is that this thread remains as an observer, since it may hold
		 * some secondary resource, but must yield to avoid deadlock. In
		 * this situation, make sure that the thread is boosted to the
		 * ceiling while blocked, so that it can re-acquire the
		 * RW lock at that priority.
		 */
		token = thread_priority_floor_start();
	}

	res = assert_wait(event, interruptible);
	if (res == THREAD_WAITING) {
		/* drop the lock (recording its held mode) while we block */
		lck_rw_type = lck_rw_done(lck);
		res = thread_block(THREAD_CONTINUE_NULL);
		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
			/* re-acquire in the mode the flags request, defaulting to the prior mode */
			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
				lck_rw_lock(lck, lck_rw_type);
			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
				lck_rw_lock_exclusive(lck);
			} else {
				lck_rw_lock_shared(lck);
			}
		}
	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
		/* wait never started; still honor the unlock request */
		(void)lck_rw_done(lck);
	}

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		thread_priority_floor_end(&token);
	}

	return res;
}
2637 
2638 /*!
2639  * @function lck_rw_sleep_deadline
2640  *
2641  * @abstract
2642  * Assert_wait_deadline on an event while holding the rw_lock.
2643  *
2644  * @discussion
2645  * the flags can decide how to re-acquire the lock upon wake up
2646  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2647  * and if the priority needs to be kept boosted until the lock is
2648  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2649  *
2650  * @param lck                   rw_lock to use to synch the assert_wait.
2651  * @param lck_sleep_action      flags.
2652  * @param event                 event to assert_wait on.
2653  * @param interruptible         wait type.
2654  * @param deadline              maximum time after which being woken up
2655  */
wait_result_t
lck_rw_sleep_deadline(
	lck_rw_t                *lck,
	lck_sleep_action_t      lck_sleep_action,
	event_t                 event,
	wait_interrupt_t        interruptible,
	uint64_t                deadline)
{
	wait_result_t           res;
	lck_rw_type_t           lck_rw_type;
	thread_pri_floor_t      token;

	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
		panic("Invalid lock sleep action %x", lck_sleep_action);
	}

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		/* keep the thread boosted while blocked; see lck_rw_sleep() */
		token = thread_priority_floor_start();
	}

	res = assert_wait_deadline(event, interruptible, deadline);
	if (res == THREAD_WAITING) {
		/* drop the lock (recording its held mode) while we block */
		lck_rw_type = lck_rw_done(lck);
		res = thread_block(THREAD_CONTINUE_NULL);
		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
			/* re-acquire in the mode the flags request, defaulting to the prior mode */
			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
				lck_rw_lock(lck, lck_rw_type);
			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
				lck_rw_lock_exclusive(lck);
			} else {
				lck_rw_lock_shared(lck);
			}
		}
	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
		/* wait never started; still honor the unlock request */
		(void)lck_rw_done(lck);
	}

	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
		thread_priority_floor_end(&token);
	}

	return res;
}
2699 
2700 /*
2701  * Reader-writer lock promotion
2702  *
2703  * We support a limited form of reader-writer
2704  * lock promotion whose effects are:
2705  *
 *   * Qualifying threads have decay disabled
 *   * Scheduler priority is reset to a floor
 *     of their statically assigned priority
 *     or MINPRI_RWLOCK
2710  *
2711  * The rationale is that lck_rw_ts do not have
2712  * a single owner, so we cannot apply a directed
2713  * priority boost from all waiting threads
2714  * to all holding threads without maintaining
2715  * lists of all shared owners and all waiting
2716  * threads for every lock.
2717  *
2718  * Instead (and to preserve the uncontended fast-
2719  * path), acquiring (or attempting to acquire)
 * a RW lock in shared or exclusive mode increments
2721  * a per-thread counter. Only if that thread stops
2722  * making forward progress (for instance blocking
2723  * on a mutex, or being preempted) do we consult
2724  * the counter and apply the priority floor.
2725  * When the thread becomes runnable again (or in
2726  * the case of preemption it never stopped being
2727  * runnable), it has the priority boost and should
2728  * be in a good position to run on the CPU and
2729  * release all RW locks (at which point the priority
2730  * boost is cleared).
2731  *
2732  * Care must be taken to ensure that priority
2733  * boosts are not retained indefinitely, since unlike
2734  * mutex priority boosts (where the boost is tied
2735  * to the mutex lifecycle), the boost is tied
2736  * to the thread and independent of any particular
2737  * lck_rw_t. Assertions are in place on return
2738  * to userspace so that the boost is not held
2739  * indefinitely.
2740  *
2741  * The routines that increment/decrement the
2742  * per-thread counter should err on the side of
2743  * incrementing any time a preemption is possible
2744  * and the lock would be visible to the rest of the
2745  * system as held (so it should be incremented before
2746  * interlocks are dropped/preemption is enabled, or
2747  * before a CAS is executed to acquire the lock).
2748  *
2749  */
2750 
2751 /*!
2752  * @function lck_rw_clear_promotion
2753  *
2754  * @abstract
2755  * Undo priority promotions when the last rw_lock
2756  * is released by a thread (if a promotion was active).
2757  *
2758  * @param thread        thread to demote.
2759  * @param trace_obj     object reason for the demotion.
2760  */
2761 void
lck_rw_clear_promotion(thread_t thread,uintptr_t trace_obj)2762 lck_rw_clear_promotion(
2763 	thread_t thread,
2764 	uintptr_t trace_obj)
2765 {
2766 	assert(thread->rwlock_count == 0);
2767 
2768 	/* Cancel any promotions if the thread had actually blocked while holding a RW lock */
2769 	spl_t s = splsched();
2770 	thread_lock(thread);
2771 
2772 	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
2773 		sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED, trace_obj);
2774 	}
2775 
2776 	thread_unlock(thread);
2777 	splx(s);
2778 }
2779 
2780 /*!
2781  * @function lck_rw_set_promotion_locked
2782  *
2783  * @abstract
2784  * Callout from context switch if the thread goes
2785  * off core with a positive rwlock_count.
2786  *
2787  * @discussion
2788  * Called at splsched with the thread locked.
2789  *
2790  * @param thread        thread to promote.
2791  */
2792 void
lck_rw_set_promotion_locked(thread_t thread)2793 lck_rw_set_promotion_locked(thread_t thread)
2794 {
2795 	if (LcksOpts & disLkRWPrio) {
2796 		return;
2797 	}
2798 
2799 	assert(thread->rwlock_count > 0);
2800 
2801 	if (!(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2802 		sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, 0);
2803 	}
2804 }
2805