xref: /xnu-12377.1.9/osfmk/kern/lock_rw.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 #define LOCK_PRIVATE 1
57 #include <debug.h>
58 #include <kern/locks_internal.h>
59 #include <kern/lock_stat.h>
60 #include <kern/locks.h>
61 #include <kern/zalloc.h>
62 #include <kern/thread.h>
63 #include <kern/processor.h>
64 #include <kern/sched_prim.h>
65 #include <kern/debug.h>
66 #include <machine/atomic.h>
67 #include <machine/machine_cpu.h>
68 
69 KALLOC_TYPE_DEFINE(KT_LCK_RW, lck_rw_t, KT_PRIV_ACCT);
70 
71 #define LCK_RW_WRITER_EVENT(lck)                (event_t)((uintptr_t)(lck)+1)
72 #define LCK_RW_READER_EVENT(lck)                (event_t)((uintptr_t)(lck)+2)
73 #define WRITE_EVENT_TO_RWLOCK(event)            ((lck_rw_t *)((uintptr_t)(event)-1))
74 #define READ_EVENT_TO_RWLOCK(event)             ((lck_rw_t *)((uintptr_t)(event)-2))
75 
76 #if CONFIG_DTRACE
77 #define DTRACE_RW_SHARED        0x0     //reader
78 #define DTRACE_RW_EXCL          0x1     //writer
79 #define DTRACE_NO_FLAG          0x0     //not applicable
80 #endif  /* CONFIG_DTRACE */
81 
82 #define LCK_RW_LCK_EXCLUSIVE_CODE       0x100
83 #define LCK_RW_LCK_EXCLUSIVE1_CODE      0x101
84 #define LCK_RW_LCK_SHARED_CODE          0x102
85 #define LCK_RW_LCK_SH_TO_EX_CODE        0x103
86 #define LCK_RW_LCK_SH_TO_EX1_CODE       0x104
87 #define LCK_RW_LCK_EX_TO_SH_CODE        0x105
88 
89 #if __x86_64__
90 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE  0x106
91 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE  0x107
92 #define LCK_RW_LCK_EX_READER_SPIN_CODE  0x108
93 #define LCK_RW_LCK_EX_READER_WAIT_CODE  0x109
94 #define LCK_RW_LCK_SHARED_SPIN_CODE     0x110
95 #define LCK_RW_LCK_SHARED_WAIT_CODE     0x111
96 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE   0x112
97 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE   0x113
98 #endif
99 
100 #define lck_rw_ilk_lock(lock)   hw_lock_bit  ((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT, LCK_GRP_NULL)
101 #define lck_rw_ilk_unlock(lock) hw_unlock_bit((hw_lock_bit_t*)(&(lock)->lck_rw_tag), LCK_RW_INTERLOCK_BIT)
102 
103 #define ordered_load_rw(lock)                   os_atomic_load(&(lock)->lck_rw_data, compiler_acq_rel)
104 #define ordered_store_rw(lock, value)           os_atomic_store(&(lock)->lck_rw_data, (value), compiler_acq_rel)
105 #define ordered_store_rw_owner(lock, value)     os_atomic_store(&(lock)->lck_rw_owner, (value), compiler_acq_rel)
106 
107 #ifdef DEBUG_RW
108 
109 STATIC_IF_KEY_DEFINE_TRUE(lck_rw_assert);
110 
111 static TUNABLE(bool, lck_rw_recursive_shared_assert_74048094, "lck_rw_recursive_shared_assert", false);
112 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) rwlde_caller_packing_params =
113     VM_PACKING_PARAMS(LCK_RW_CALLER_PACKED);
114 
115 #define set_rwlde_caller_packed(entry, caller)          ((entry)->rwlde_caller_packed = VM_PACK_POINTER((vm_offset_t)caller, LCK_RW_CALLER_PACKED))
116 #define get_rwlde_caller(entry)                         ((void*)VM_UNPACK_POINTER(entry->rwlde_caller_packed, LCK_RW_CALLER_PACKED))
117 
118 #endif /* DEBUG_RW */
119 
120 /*!
121  * @function lck_rw_alloc_init
122  *
123  * @abstract
124  * Allocates and initializes a rw_lock_t.
125  *
126  * @discussion
127  * The function can block. See lck_rw_init() for initialization details.
128  *
129  * @param grp           lock group to associate with the lock.
130  * @param attr          lock attribute to initialize the lock.
131  *
132  * @returns             NULL or the allocated lock
133  */
134 lck_rw_t *
135 lck_rw_alloc_init(
136 	lck_grp_t       *grp,
137 	lck_attr_t      *attr)
138 {
139 	lck_rw_t *lck;
140 
141 	lck = zalloc_flags(KT_LCK_RW, Z_WAITOK | Z_ZERO);
142 	lck_rw_init(lck, grp, attr);
143 	return lck;
144 }
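
/*
 * Usage sketch: a minimal lifecycle for a dynamically allocated lock,
 * assuming a caller-created lock group; the group name below is
 * illustrative only.
 *
 *	lck_grp_t *grp  = lck_grp_alloc_init("example_grp", LCK_GRP_ATTR_NULL);
 *	lck_rw_t  *lock = lck_rw_alloc_init(grp, LCK_ATTR_NULL);
 *
 *	... take the lock with lck_rw_lock_shared()/lck_rw_lock_exclusive() ...
 *
 *	lck_rw_free(lock, grp);
 */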
145 
146 /*!
147  * @function lck_rw_init
148  *
149  * @abstract
150  * Initializes a rw_lock_t.
151  *
152  * @discussion
153  * Usage statistics for the lock are going to be added to the lock group provided.
154  *
155  * The lock attribute can be used to specify the lock contention behaviour.
156  * RW_WRITER_PRIORITY is the default behaviour (LCK_ATTR_NULL defaults to RW_WRITER_PRIORITY)
157  * and lck_attr_rw_shared_priority() can be used to set the behaviour to RW_SHARED_PRIORITY.
158  *
159  * RW_WRITER_PRIORITY gives priority to the writers upon contention with the readers;
160  * if the lock is held and a writer starts waiting for the lock, readers will not be able
161  * to acquire the lock until all writers stop contending. Readers could
162  * potentially starve.
163  * RW_SHARED_PRIORITY gives priority to the readers upon contention with the writers:
164  * unless the lock is held in exclusive mode, readers will always be able to acquire the lock.
165  * Readers can lock a shared lock even if there are writers waiting. Writers could potentially
166  * starve.
167  *
168  * @param lck           lock to initialize.
169  * @param grp           lock group to associate with the lock.
170  * @param attr          lock attribute to initialize the lock.
171  *
172  */
173 void
174 lck_rw_init(
175 	lck_rw_t        *lck,
176 	lck_grp_t       *grp,
177 	lck_attr_t      *attr)
178 {
179 	/* keep this so that the lck_type_t type is referenced for lldb */
180 	lck_type_t type = LCK_TYPE_RW;
181 
182 	if (attr == LCK_ATTR_NULL) {
183 		attr = &lck_attr_default;
184 	}
185 	*lck = (lck_rw_t){
186 		.lck_rw_type = type,
187 		.lck_rw_can_sleep = true,
188 		.lck_rw_priv_excl = !(attr->lck_attr_val & LCK_ATTR_RW_SHARED_PRIORITY),
189 	};
190 	lck_grp_reference(grp, &grp->lck_grp_rwcnt);
191 }
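
/*
 * Usage sketch: selecting RW_SHARED_PRIORITY through a custom attribute,
 * as described above. The embedded lock field and group names are
 * illustrative only.
 *
 *	lck_attr_t *attr = lck_attr_alloc_init();
 *	lck_attr_rw_shared_priority(attr);
 *	lck_rw_init(&my_obj->obj_lock, my_obj_lck_grp, attr);
 *	lck_attr_free(attr);
 */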
192 
193 /*!
194  * @function lck_rw_free
195  *
196  * @abstract
197  * Frees a rw_lock previously allocated with lck_rw_alloc_init().
198  *
199  * @discussion
200  * The lock must not be held by any thread.
201  *
202  * @param lck           rw_lock to free.
203  */
204 void
205 lck_rw_free(
206 	lck_rw_t        *lck,
207 	lck_grp_t       *grp)
208 {
209 	lck_rw_destroy(lck, grp);
210 	zfree(KT_LCK_RW, lck);
211 }
212 
213 /*!
214  * @function lck_rw_destroy
215  *
216  * @abstract
217  * Destroys a rw_lock previously initialized with lck_rw_init().
218  *
219  * @discussion
220  * The lock must not be held by any thread.
221  *
222  * @param lck           rw_lock to destroy.
223  */
224 void
225 lck_rw_destroy(
226 	lck_rw_t        *lck,
227 	lck_grp_t       *grp)
228 {
229 	if (lck->lck_rw_type != LCK_TYPE_RW ||
230 	    lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
231 		panic("Destroying previously destroyed lock %p", lck);
232 	}
233 	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
234 
235 	lck->lck_rw_type = LCK_TYPE_NONE;
236 	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
237 	lck_grp_deallocate(grp, &grp->lck_grp_rwcnt);
238 }
239 
240 #ifdef DEBUG_RW
241 
242 /*
243  * Best effort mechanism to debug rw_locks.
244  *
245  * This mechanism is in addition to the owner checks. The owner is set
246  * only when the lock is held in exclusive mode so the checks do not cover
247  * the cases in which the lock is held in shared mode.
248  *
249  * This mechanism tentatively stores the rw_lock acquired and its debug
250  * information on the thread struct.
251  * Only up to LCK_RW_EXPECTED_MAX_NUMBER rw_lock debug entries can be stored.
252  *
253  * NOTE: LCK_RW_EXPECTED_MAX_NUMBER is the expected number of rw_locks held
254  * at the same time. If a thread holds more than this number of rw_locks we
255  * will start losing debug information.
256  * Increasing LCK_RW_EXPECTED_MAX_NUMBER will increase the probability we will
257  * store the debug information but it will require more memory per thread
258  * and longer lock/unlock time.
259  *
260  * If an empty slot is found for the debug information, we record the lock;
261  * otherwise we set the overflow threshold flag.
262  *
263  * If we reached the overflow threshold we might stop asserting because we cannot be sure
264  * anymore if the lock was acquired or not.
265  *
266  * Even if we reached the overflow threshold, we try to store the debug information
267  * for the new locks acquired. This can be useful in core dumps to debug
268  * possible return to userspace without unlocking and to find possible readers
269  * holding the lock.
270  */
271 #if DEBUG_RW
272 
273 __static_if_init_func
274 void
275 lck_rw_assert_init(const char *args, uint64_t kf_ovrd)
276 {
277 	bool lck_rw_assert_disable = false;
278 
279 	if (kf_ovrd & KF_MACH_ASSERT_OVRD) {
280 		lck_rw_assert_disable = true;
281 	}
282 
283 	if (static_if_boot_arg_uint64(args, "lcks", 0) &
284 	    LCK_OPTION_DISABLE_RW_DEBUG) {
285 		lck_rw_assert_disable = true;
286 	}
287 
288 	if (lck_rw_assert_disable) {
289 		static_if_key_disable(lck_rw_assert);
290 	}
291 }
292 
293 #endif /* DEBUG_RW */
294 
295 static inline struct rw_lock_debug_entry *
296 find_lock_in_savedlocks(lck_rw_t* lock, rw_lock_debug_t *rw_locks_held)
297 {
298 	int i;
299 	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
300 		struct rw_lock_debug_entry *existing = &rw_locks_held->rwld_locks[i];
301 		if (existing->rwlde_lock == lock) {
302 			return existing;
303 		}
304 	}
305 
306 	return NULL;
307 }
308 
309 __abortlike
310 static void
311 rwlock_slot_panic(rw_lock_debug_t *rw_locks_held)
312 {
313 	panic("No empty slot found in %p slot_used %d", rw_locks_held, rw_locks_held->rwld_locks_saved);
314 }
315 
316 static inline struct rw_lock_debug_entry *
317 find_empty_slot(rw_lock_debug_t *rw_locks_held)
318 {
319 	int i;
320 	for (i = 0; i < LCK_RW_EXPECTED_MAX_NUMBER; i++) {
321 		struct rw_lock_debug_entry *entry = &rw_locks_held->rwld_locks[i];
322 		if (entry->rwlde_lock == NULL) {
323 			return entry;
324 		}
325 	}
326 	rwlock_slot_panic(rw_locks_held);
327 }
328 
329 __abortlike
330 static void
331 canlock_rwlock_panic(lck_rw_t* lock, thread_t thread, struct rw_lock_debug_entry *entry)
332 {
333 	panic("RW lock %p already held by %p caller %p mode_count %d state 0x%x owner 0x%p ",
334 	    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
335 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
336 }
337 
338 __attribute__((noinline))
339 static void
340 assert_canlock_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
341 {
342 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
343 	if (__probable(rw_locks_held->rwld_locks_acquired == 0)) {
344 		//no locks saved, safe to lock
345 		return;
346 	}
347 
348 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
349 	if (__improbable(entry != NULL)) {
350 		boolean_t can_be_shared_recursive;
351 		if (lck_rw_recursive_shared_assert_74048094) {
352 			can_be_shared_recursive = (lock->lck_rw_priv_excl == 0);
353 		} else {
354 			/* currently rw_lock_shared is called recursively,
355 			 * until the code is fixed allow to lock
356 			 * recursively in shared mode
357 			 */
358 			can_be_shared_recursive = TRUE;
359 		}
360 		if ((type == LCK_RW_TYPE_SHARED) && can_be_shared_recursive && entry->rwlde_mode_count >= 1) {
361 			return;
362 		}
363 		canlock_rwlock_panic(lock, thread, entry);
364 	}
365 }
366 
367 static inline void
368 assert_canlock_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
369 {
370 	if (lck_rw_assert_enabled()) {
371 		assert_canlock_rwlock_slow(lock, thread, type);
372 	}
373 }
374 
375 __abortlike
376 static void
377 held_rwlock_notheld_panic(lck_rw_t* lock, thread_t thread)
378 {
379 	panic("RW lock %p not held by %p", lock, thread);
380 }
381 
382 __abortlike
383 static void
384 held_rwlock_notheld_with_info_panic(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, struct rw_lock_debug_entry *entry)
385 {
386 	if (type == LCK_RW_TYPE_EXCLUSIVE) {
387 		panic("RW lock %p not held in exclusive by %p caller %p read %d state 0x%x owner 0x%p ",
388 		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
389 		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
390 	} else {
391 		panic("RW lock %p not held in shared by %p caller %p read %d state 0x%x owner 0x%p ",
392 		    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
393 		    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
394 	}
395 }
396 
397 __attribute__((noinline))
398 static void
399 assert_held_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
400 {
401 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
402 
403 	if (__improbable(rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_locks_saved == 0)) {
404 		if (rw_locks_held->rwld_locks_acquired == 0 || rw_locks_held->rwld_overflow == 0) {
405 			held_rwlock_notheld_panic(lock, thread);
406 		}
407 		return;
408 	}
409 
410 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
411 	if (__probable(entry != NULL)) {
412 		if (type == LCK_RW_TYPE_EXCLUSIVE && entry->rwlde_mode_count != -1) {
413 			held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
414 		} else {
415 			if (type == LCK_RW_TYPE_SHARED && entry->rwlde_mode_count <= 0) {
416 				held_rwlock_notheld_with_info_panic(lock, thread, type, entry);
417 			}
418 		}
419 	} else {
420 		if (rw_locks_held->rwld_overflow == 0) {
421 			held_rwlock_notheld_panic(lock, thread);
422 		}
423 	}
424 }
425 
426 static inline void
427 assert_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
428 {
429 	if (lck_rw_assert_enabled()) {
430 		assert_held_rwlock_slow(lock, thread, type);
431 	}
432 }
433 
434 __attribute__((noinline))
435 static void
436 change_held_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t typeFrom, void* caller)
437 {
438 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
439 	if (__improbable(rw_locks_held->rwld_locks_saved == 0)) {
440 		if (rw_locks_held->rwld_overflow == 0) {
441 			held_rwlock_notheld_panic(lock, thread);
442 		}
443 		return;
444 	}
445 
446 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
447 	if (__probable(entry != NULL)) {
448 		if (typeFrom == LCK_RW_TYPE_SHARED) {
449 			//We are upgrading
450 			assertf(entry->rwlde_mode_count == 1,
451 			    "RW lock %p not held by a single shared when upgrading "
452 			    "by %p caller %p read %d state 0x%x owner 0x%p ",
453 			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
454 			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
455 			entry->rwlde_mode_count = -1;
456 			set_rwlde_caller_packed(entry, caller);
457 		} else {
458 			//We are downgrading
459 			assertf(entry->rwlde_mode_count == -1,
460 			    "RW lock %p not held in write mode when downgrading "
461 			    "by %p caller %p read %d state 0x%x owner 0x%p ",
462 			    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
463 			    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
464 			entry->rwlde_mode_count = 1;
465 			set_rwlde_caller_packed(entry, caller);
466 		}
467 		return;
468 	}
469 
470 	if (rw_locks_held->rwld_overflow == 0) {
471 		held_rwlock_notheld_panic(lock, thread);
472 	}
473 
474 	if (rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER) {
475 		//array is full
476 		return;
477 	}
478 
479 	struct rw_lock_debug_entry *null_entry = find_empty_slot(rw_locks_held);
480 	null_entry->rwlde_lock = lock;
481 	set_rwlde_caller_packed(null_entry, caller);
482 	if (typeFrom == LCK_RW_TYPE_SHARED) {
483 		null_entry->rwlde_mode_count = -1;
484 	} else {
485 		null_entry->rwlde_mode_count = 1;
486 	}
487 	rw_locks_held->rwld_locks_saved++;
488 }
489 
490 static inline void
491 change_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t typeFrom, void* caller)
492 {
493 	if (lck_rw_assert_enabled()) {
494 		change_held_rwlock_slow(lock, thread, typeFrom, caller);
495 	}
496 }
497 
498 __abortlike
499 static void
500 add_held_rwlock_too_many_panic(thread_t thread)
501 {
502 	panic("RW lock too many rw locks held, rwld_locks_acquired maxed out for thread %p", thread);
503 }
504 
505 static __attribute__((noinline)) void
506 add_held_rwlock_slow(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, void* caller)
507 {
508 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
509 	struct rw_lock_debug_entry *null_entry;
510 	if (__improbable(rw_locks_held->rwld_locks_acquired == UINT32_MAX)) {
511 		add_held_rwlock_too_many_panic(thread);
512 	}
513 	rw_locks_held->rwld_locks_acquired++;
514 
515 	if (type == LCK_RW_TYPE_EXCLUSIVE) {
516 		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
517 			//array is full
518 			rw_locks_held->rwld_overflow = 1;
519 			return;
520 		}
521 		null_entry = find_empty_slot(rw_locks_held);
522 		null_entry->rwlde_lock = lock;
523 		set_rwlde_caller_packed(null_entry, caller);
524 		null_entry->rwlde_mode_count = -1;
525 		rw_locks_held->rwld_locks_saved++;
526 		return;
527 	} else {
528 		if (__probable(rw_locks_held->rwld_locks_saved == 0)) {
529 			//array is empty
530 			goto add_shared;
531 		}
532 
533 		boolean_t allow_shared_recursive;
534 		if (lck_rw_recursive_shared_assert_74048094) {
535 			allow_shared_recursive = (lock->lck_rw_priv_excl == 0);
536 		} else {
537 			allow_shared_recursive = TRUE;
538 		}
539 		if (allow_shared_recursive) {
540 			//It could be already locked in shared mode
541 			struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
542 			if (entry != NULL) {
543 				assert(entry->rwlde_mode_count > 0);
544 				assertf(entry->rwlde_mode_count != INT8_MAX,
545 				    "RW lock %p with too many recursive shared held "
546 				    "from %p caller %p read %d state 0x%x owner 0x%p",
547 				    lock, thread, get_rwlde_caller(entry), entry->rwlde_mode_count,
548 				    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
549 				entry->rwlde_mode_count += 1;
550 				return;
551 			}
552 		}
553 
554 		//none of the locks were a match
555 		//try to add a new entry
556 		if (__improbable(rw_locks_held->rwld_locks_saved == LCK_RW_EXPECTED_MAX_NUMBER)) {
557 			//array is full
558 			rw_locks_held->rwld_overflow = 1;
559 			return;
560 		}
561 
562 add_shared:
563 		null_entry = find_empty_slot(rw_locks_held);
564 		null_entry->rwlde_lock = lock;
565 		set_rwlde_caller_packed(null_entry, caller);
566 		null_entry->rwlde_mode_count = 1;
567 		rw_locks_held->rwld_locks_saved++;
568 	}
569 }
570 
571 static inline void
572 add_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type, void* caller)
573 {
574 	if (lck_rw_assert_enabled()) {
575 		add_held_rwlock_slow(lock, thread, type, caller);
576 	}
577 }
578 
579 static void
580 remove_held_rwlock_slow(lck_rw_t *lock, thread_t thread, lck_rw_type_t type)
581 {
582 	rw_lock_debug_t *rw_locks_held = &thread->rw_lock_held;
583 	if (__improbable(rw_locks_held->rwld_locks_acquired == 0)) {
584 		return;
585 	}
586 	rw_locks_held->rwld_locks_acquired--;
587 
588 	if (rw_locks_held->rwld_locks_saved == 0) {
589 		assert(rw_locks_held->rwld_overflow == 1);
590 		goto out;
591 	}
592 
593 	struct rw_lock_debug_entry *entry = find_lock_in_savedlocks(lock, rw_locks_held);
594 	if (__probable(entry != NULL)) {
595 		if (type == LCK_RW_TYPE_EXCLUSIVE) {
596 			assert(entry->rwlde_mode_count == -1);
597 			entry->rwlde_mode_count = 0;
598 		} else {
599 			assert(entry->rwlde_mode_count > 0);
600 			entry->rwlde_mode_count--;
601 			if (entry->rwlde_mode_count > 0) {
602 				goto out;
603 			}
604 		}
605 		entry->rwlde_caller_packed = 0;
606 		entry->rwlde_lock = NULL;
607 		rw_locks_held->rwld_locks_saved--;
608 	} else {
609 		assert(rw_locks_held->rwld_overflow == 1);
610 	}
611 
612 out:
613 	if (rw_locks_held->rwld_locks_acquired == 0) {
614 		rw_locks_held->rwld_overflow = 0;
615 	}
616 	return;
617 }
618 
619 static inline void
620 remove_held_rwlock(lck_rw_t* lock, thread_t thread, lck_rw_type_t type)
621 {
622 	if (lck_rw_assert_enabled()) {
623 		remove_held_rwlock_slow(lock, thread, type);
624 	}
625 }
626 #endif /* DEBUG_RW */
627 
628 /*
629  * We disable interrupts while holding the RW interlock to prevent an
630  * interrupt from exacerbating hold time.
631  * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
632  */
633 static inline boolean_t
634 lck_interlock_lock(
635 	lck_rw_t        *lck)
636 {
637 	boolean_t       istate;
638 
639 	istate = ml_set_interrupts_enabled(FALSE);
640 	lck_rw_ilk_lock(lck);
641 	return istate;
642 }
643 
644 static inline void
645 lck_interlock_unlock(
646 	lck_rw_t        *lck,
647 	boolean_t       istate)
648 {
649 	lck_rw_ilk_unlock(lck);
650 	ml_set_interrupts_enabled(istate);
651 }
652 
653 /*
654  * compute the deadline to spin against when
655  * waiting for a change of state on a lck_rw_t
656  */
657 static inline uint64_t
658 lck_rw_deadline_for_spin(
659 	lck_rw_t        *lck)
660 {
661 	lck_rw_word_t   word;
662 
663 	word.data = ordered_load_rw(lck);
664 	if (word.can_sleep) {
665 		if (word.r_waiting || word.w_waiting || (word.shared_count > machine_info.max_cpus)) {
666 			/*
667 			 * there are already threads waiting on this lock... this
668 			 * implies that they have spun beyond their deadlines waiting for
669 			 * the desired state to show up so we will not bother spinning at this time...
670 			 *   or
671 			 * the current number of threads sharing this lock exceeds our capacity to run them
672 			 * concurrently and since all states we're going to spin for require the rw_shared_count
673 			 * to be at 0, we'll not bother spinning since the latency for this to happen is
674 			 * unpredictable...
675 			 */
676 			return mach_absolute_time();
677 		}
678 		return mach_absolute_time() + os_atomic_load(&MutexSpin, relaxed);
679 	} else {
680 		return mach_absolute_time() + (100000LL * 1000000000LL);
681 	}
682 }
683 
684 /*
685  * This inline is used when busy-waiting for an rw lock.
686  * If interrupts were disabled when the lock primitive was called,
687  * we poll the IPI handler for pending TLB flushes on x86.
688  */
689 static inline void
690 lck_rw_lock_pause(
691 	boolean_t       interrupts_enabled)
692 {
693 #if X86_64
694 	if (!interrupts_enabled) {
695 		handle_pending_TLB_flushes();
696 	}
697 	cpu_pause();
698 #else
699 	(void) interrupts_enabled;
700 	wait_for_event();
701 #endif
702 }
703 
704 typedef enum __enum_closed {
705 	LCK_RW_DRAIN_S_DRAINED       = 0,
706 	LCK_RW_DRAIN_S_NOT_DRAINED   = 1,
707 	LCK_RW_DRAIN_S_EARLY_RETURN  = 2,
708 	LCK_RW_DRAIN_S_TIMED_OUT     = 3,
709 } lck_rw_drain_state_t;
710 
711 static lck_rw_drain_state_t
712 lck_rw_drain_status(
713 	lck_rw_t        *lock,
714 	uint32_t        status_mask,
715 	boolean_t       wait,
716 	bool            (^lock_pause)(void))
717 {
718 	uint64_t        deadline = 0;
719 	uint32_t        data;
720 	boolean_t       istate = FALSE;
721 
722 	if (wait) {
723 		deadline = lck_rw_deadline_for_spin(lock);
724 #if __x86_64__
725 		istate = ml_get_interrupts_enabled();
726 #endif
727 	}
728 
729 	for (;;) {
730 #if __x86_64__
731 		data = os_atomic_load(&lock->lck_rw_data, relaxed);
732 #else
733 		data = load_exclusive32(&lock->lck_rw_data, memory_order_acquire_smp);
734 #endif
735 		if ((data & status_mask) == 0) {
736 			atomic_exchange_abort();
737 			return LCK_RW_DRAIN_S_DRAINED;
738 		}
739 
740 		if (!wait) {
741 			atomic_exchange_abort();
742 			return LCK_RW_DRAIN_S_NOT_DRAINED;
743 		}
744 
745 		lck_rw_lock_pause(istate);
746 
747 		if (mach_absolute_time() >= deadline) {
748 			return LCK_RW_DRAIN_S_TIMED_OUT;
749 		}
750 
751 		if (lock_pause && lock_pause()) {
752 			return LCK_RW_DRAIN_S_EARLY_RETURN;
753 		}
754 	}
755 }
756 
757 /*
758  * Spin while interlock is held.
759  */
760 static inline void
761 lck_rw_interlock_spin(
762 	lck_rw_t        *lock)
763 {
764 	uint32_t        data, prev;
765 
766 	for (;;) {
767 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_relaxed);
768 		if (data & LCK_RW_INTERLOCK) {
769 #if __x86_64__
770 			cpu_pause();
771 #else
772 			wait_for_event();
773 #endif
774 		} else {
775 			atomic_exchange_abort();
776 			return;
777 		}
778 	}
779 }
780 
781 #define LCK_RW_GRAB_WANT        0
782 #define LCK_RW_GRAB_SHARED      1
783 
784 typedef enum __enum_closed __enum_options {
785 	LCK_RW_GRAB_F_SHARED    = 0x0,  // Not really a flag obviously but makes call sites more readable.
786 	LCK_RW_GRAB_F_WANT_EXCL = 0x1,
787 	LCK_RW_GRAB_F_WAIT      = 0x2,
788 } lck_rw_grab_flags_t;
789 
790 typedef enum __enum_closed {
791 	LCK_RW_GRAB_S_NOT_LOCKED    = 0,
792 	LCK_RW_GRAB_S_LOCKED        = 1,
793 	LCK_RW_GRAB_S_EARLY_RETURN  = 2,
794 	LCK_RW_GRAB_S_TIMED_OUT     = 3,
795 } lck_rw_grab_state_t;
796 
797 static lck_rw_grab_state_t
798 lck_rw_grab(
799 	lck_rw_t            *lock,
800 	lck_rw_grab_flags_t flags,
801 	bool                (^lock_pause)(void))
802 {
803 	uint64_t        deadline = 0;
804 	uint32_t        data, prev;
805 	boolean_t       do_exch, istate = FALSE;
806 
807 	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);
808 
809 	if ((flags & LCK_RW_GRAB_F_WAIT) != 0) {
810 		deadline = lck_rw_deadline_for_spin(lock);
811 #if __x86_64__
812 		istate = ml_get_interrupts_enabled();
813 #endif
814 	}
815 
816 	for (;;) {
817 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
818 		if (data & LCK_RW_INTERLOCK) {
819 			atomic_exchange_abort();
820 			lck_rw_interlock_spin(lock);
821 			continue;
822 		}
823 		do_exch = FALSE;
824 		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
825 			if ((data & LCK_RW_WANT_EXCL) == 0) {
826 				data |= LCK_RW_WANT_EXCL;
827 				do_exch = TRUE;
828 			}
829 		} else {        // LCK_RW_GRAB_SHARED
830 			if (((data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) == 0) ||
831 			    (((data & LCK_RW_SHARED_MASK)) && ((data & LCK_RW_PRIV_EXCL) == 0))) {
832 				data += LCK_RW_SHARED_READER;
833 				do_exch = TRUE;
834 			}
835 		}
836 		if (do_exch) {
837 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
838 				return LCK_RW_GRAB_S_LOCKED;
839 			}
840 		} else {
841 			if ((flags & LCK_RW_GRAB_F_WAIT) == 0) {
842 				atomic_exchange_abort();
843 				return LCK_RW_GRAB_S_NOT_LOCKED;
844 			}
845 
846 			lck_rw_lock_pause(istate);
847 
848 			if (mach_absolute_time() >= deadline) {
849 				return LCK_RW_GRAB_S_TIMED_OUT;
850 			}
851 			if (lock_pause && lock_pause()) {
852 				return LCK_RW_GRAB_S_EARLY_RETURN;
853 			}
854 		}
855 	}
856 }
857 
858 /*
859  * The inverse of lck_rw_grab - drops either the LCK_RW_WANT_EXCL bit or
860  * decrements the reader count. Doesn't deal with waking up waiters - i.e.
861  * should only be called when can_sleep is false.
862  */
863 static void
864 lck_rw_drop(lck_rw_t *lock, lck_rw_grab_flags_t flags)
865 {
866 	uint32_t data, prev;
867 
868 	assert3u(flags & ~(LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT), ==, 0);
869 	assert(!lock->lck_rw_can_sleep);
870 
871 	for (;;) {
872 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
873 
874 		/* Interlock should never be taken when can_sleep is false. */
875 		assert3u(data & LCK_RW_INTERLOCK, ==, 0);
876 
877 		if ((flags & LCK_RW_GRAB_F_WANT_EXCL) != 0) {
878 			data &= ~LCK_RW_WANT_EXCL;
879 		} else {
880 			data -= LCK_RW_SHARED_READER;
881 		}
882 
883 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
884 			break;
885 		}
886 
887 		cpu_pause();
888 	}
889 
890 	return;
891 }
892 
893 static boolean_t
894 lck_rw_lock_exclusive_gen(
895 	lck_rw_t        *lock,
896 	bool            (^lock_pause)(void))
897 {
898 	__assert_only thread_t self = current_thread();
899 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
900 	lck_rw_word_t           word;
901 	int                     slept = 0;
902 	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
903 	lck_rw_drain_state_t    drain_state = LCK_RW_DRAIN_S_NOT_DRAINED;
904 	wait_result_t           res = 0;
905 	boolean_t               istate;
906 
907 #if     CONFIG_DTRACE
908 	boolean_t dtrace_ls_initialized = FALSE;
909 	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
910 	uint64_t wait_interval = 0;
911 	int readers_at_sleep = 0;
912 #endif
913 
914 	assertf(lock->lck_rw_owner != self->ctid,
915 	    "Lock already held state=0x%x, owner=%p",
916 	    ordered_load_rw(lock), self);
917 
918 #ifdef DEBUG_RW
919 	/*
920 	 * Best effort attempt to check that this thread
921 	 * is not already holding the lock (this checks read mode too).
922 	 */
923 	assert_canlock_rwlock(lock, self, LCK_RW_TYPE_EXCLUSIVE);
924 #endif /* DEBUG_RW */
925 
926 	/*
927 	 *	Try to acquire the lck_rw_want_excl bit.
928 	 */
929 	while (lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL, NULL) != LCK_RW_GRAB_S_LOCKED) {
930 #if     CONFIG_DTRACE
931 		if (dtrace_ls_initialized == FALSE) {
932 			dtrace_ls_initialized = TRUE;
933 			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
934 			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
935 			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
936 			if (dtrace_ls_enabled) {
937 				/*
938 				 * Either sleeping or spinning is happening,
939 				 *  start a timing of our delay interval now.
940 				 */
941 				readers_at_sleep = lock->lck_rw_shared_count;
942 				wait_interval = mach_absolute_time();
943 			}
944 		}
945 #endif
946 
947 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START,
948 		    trace_lck, 0, 0, 0, 0);
949 
950 		grab_state = lck_rw_grab(lock, LCK_RW_GRAB_F_WANT_EXCL | LCK_RW_GRAB_F_WAIT, lock_pause);
951 
952 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END,
953 		    trace_lck, 0, 0, grab_state, 0);
954 
955 		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
956 		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
957 			break;
958 		}
959 		/*
960 		 * if we get here, the deadline has expired w/o us
961 		 * being able to grab the lock exclusively
962 		 * check to see if we're allowed to do a thread_block
963 		 */
964 		word.data = ordered_load_rw(lock);
965 		if (word.can_sleep) {
966 			istate = lck_interlock_lock(lock);
967 			word.data = ordered_load_rw(lock);
968 
969 			if (word.want_excl) {
970 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
971 
972 				word.w_waiting = 1;
973 				ordered_store_rw(lock, word.data);
974 
975 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
976 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
977 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
978 				lck_interlock_unlock(lock, istate);
979 				if (res == THREAD_WAITING) {
980 					res = thread_block(THREAD_CONTINUE_NULL);
981 					slept++;
982 				}
983 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
984 			} else {
985 				word.want_excl = 1;
986 				ordered_store_rw(lock, word.data);
987 				lck_interlock_unlock(lock, istate);
988 				break;
989 			}
990 		}
991 	}
992 
993 	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
994 		assert(lock_pause);
995 		return FALSE;
996 	}
997 
998 	/*
999 	 * Wait for readers (and upgrades) to finish...
1000 	 */
1001 	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
1002 #if     CONFIG_DTRACE
1003 		/*
1004 		 * Either sleeping or spinning is happening, start
1005 		 * a timing of our delay interval now.  If we set it
1006 		 * to -1 we don't have accurate data so we cannot later
1007 		 * decide to record a dtrace spin or sleep event.
1008 		 */
1009 		if (dtrace_ls_initialized == FALSE) {
1010 			dtrace_ls_initialized = TRUE;
1011 			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1012 			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1013 			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1014 			if (dtrace_ls_enabled) {
1015 				/*
1016 				 * Either sleeping or spinning is happening,
1017 				 *  start a timing of our delay interval now.
1018 				 */
1019 				readers_at_sleep = lock->lck_rw_shared_count;
1020 				wait_interval = mach_absolute_time();
1021 			}
1022 		}
1023 #endif
1024 
1025 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1026 
1027 		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK | LCK_RW_WANT_UPGRADE, TRUE, lock_pause);
1028 
1029 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, drain_state, 0);
1030 
1031 		if (drain_state == LCK_RW_DRAIN_S_DRAINED ||
1032 		    drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
1033 			break;
1034 		}
1035 		/*
1036 		 * if we get here, the deadline has expired w/o us
1037 		 * being able to grab the lock exclusively
1038 		 * check to see if we're allowed to do a thread_block
1039 		 */
1040 		word.data = ordered_load_rw(lock);
1041 		if (word.can_sleep) {
1042 			istate = lck_interlock_lock(lock);
1043 			word.data = ordered_load_rw(lock);
1044 
1045 			if (word.shared_count != 0 || word.want_upgrade) {
1046 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1047 
1048 				word.w_waiting = 1;
1049 				ordered_store_rw(lock, word.data);
1050 
1051 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1052 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1053 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1054 				lck_interlock_unlock(lock, istate);
1055 
1056 				if (res == THREAD_WAITING) {
1057 					res = thread_block(THREAD_CONTINUE_NULL);
1058 					slept++;
1059 				}
1060 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1061 			} else {
1062 				lck_interlock_unlock(lock, istate);
1063 				/*
1064 				 * must own the lock now, since we checked for
1065 				 * readers or upgrade owner behind the interlock
1066 				 * no need for a call to 'lck_rw_drain_status'
1067 				 */
1068 				break;
1069 			}
1070 		}
1071 	}
1072 
1073 #if     CONFIG_DTRACE
1074 	/*
1075 	 * Decide what latencies we suffered that are Dtrace events.
1076 	 * If we have set wait_interval, then we either spun or slept.
1077 	 * At least we get out from under the interlock before we record
1078 	 * which is the best we can do here to minimize the impact
1079 	 * of the tracing.
1080 	 * If we have set wait_interval to -1, then dtrace was not enabled when we
1081 	 * started sleeping/spinning so we don't record this event.
1082 	 */
1083 	if (dtrace_ls_enabled == TRUE) {
1084 		if (slept == 0) {
1085 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lock,
1086 			    mach_absolute_time() - wait_interval, 1);
1087 		} else {
1088 			/*
1089 			 * For the blocking case, we also record if when we blocked
1090 			 * it was held for read or write, and how many readers.
1091 			 * Notice that above we recorded this before we dropped
1092 			 * the interlock so the count is accurate.
1093 			 */
1094 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lock,
1095 			    mach_absolute_time() - wait_interval, 1,
1096 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1097 		}
1098 	}
1099 #endif /* CONFIG_DTRACE */
1100 
1101 	if (drain_state == LCK_RW_DRAIN_S_EARLY_RETURN) {
1102 		lck_rw_drop(lock, LCK_RW_GRAB_F_WANT_EXCL);
1103 		assert(lock_pause);
1104 		return FALSE;
1105 	}
1106 
1107 #if CONFIG_DTRACE
1108 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, 1);
1109 #endif  /* CONFIG_DTRACE */
1110 
1111 	return TRUE;
1112 }
1113 
1114 static inline void
1115 lck_rw_lock_check_preemption(lck_rw_t *lock __unused)
1116 {
1117 	assertf((get_preemption_level() == 0 && ml_get_interrupts_enabled()) ||
1118 	    startup_phase < STARTUP_SUB_EARLY_BOOT ||
1119 	    current_cpu_datap()->cpu_hibernate ||
1120 	    ml_is_quiescing() ||
1121 	    !not_in_kdp,
1122 	    "%s: attempt to take rwlock %p in non-preemptible or interrupt context: "
1123 	    "preemption level = %d, interruptible = %d", __func__, lock,
1124 	    get_preemption_level(), (int)ml_get_interrupts_enabled());
1125 }
1126 
1127 #define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->lck_rw_data, \
1128 	    (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
1129 	    LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
1130 /*!
1131  * @function lck_rw_lock_exclusive_check_contended
1132  *
1133  * @abstract
1134  * Locks a rw_lock in exclusive mode.
1135  *
1136  * @discussion
1137  * This routine IS EXPERIMENTAL.
1138  * It's only used for the vm object lock, and use for other subsystems is UNSUPPORTED.
1139  * Note that the return value is ONLY A HEURISTIC w.r.t. the lock's contention.
1140  *
1141  * @param lock           rw_lock to lock.
1142  *
1143  * @returns Returns TRUE if the thread spun or blocked while attempting to acquire the lock, FALSE
1144  *          otherwise.
1145  */
1146 bool
1147 lck_rw_lock_exclusive_check_contended(
1148 	lck_rw_t        *lock)
1149 {
1150 	thread_t        thread = current_thread();
1151 	bool            contended  = false;
1152 
1153 	if (lock->lck_rw_can_sleep) {
1154 		lck_rw_lock_check_preemption(lock);
1155 		lck_rw_lock_count_inc(thread, lock);
1156 	} else if (get_preemption_level() == 0) {
1157 		panic("Taking non-sleepable RW lock with preemption enabled");
1158 	}
1159 
1160 	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1161 #if     CONFIG_DTRACE
1162 		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1163 #endif  /* CONFIG_DTRACE */
1164 	} else {
1165 		contended = true;
1166 		(void) lck_rw_lock_exclusive_gen(lock, NULL);
1167 	}
1168 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1169 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1170 	ordered_store_rw_owner(lock, thread->ctid);
1171 
1172 #ifdef DEBUG_RW
1173 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, __builtin_return_address(0));
1174 #endif /* DEBUG_RW */
1175 	return contended;
1176 }
1177 
1178 __attribute__((always_inline))
1179 static boolean_t
1180 lck_rw_lock_exclusive_internal_inline(
1181 	lck_rw_t        *lock,
1182 	void            *caller,
1183 	bool            (^lock_pause)(void))
1184 {
1185 #pragma unused(caller)
1186 	thread_t        thread = current_thread();
1187 
1188 	if (lock->lck_rw_can_sleep) {
1189 		lck_rw_lock_check_preemption(lock);
1190 		lck_rw_lock_count_inc(thread, lock);
1191 	} else if (get_preemption_level() == 0) {
1192 		panic("Taking non-sleepable RW lock with preemption enabled");
1193 	}
1194 
1195 	if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1196 #if     CONFIG_DTRACE
1197 		LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1198 #endif  /* CONFIG_DTRACE */
1199 	} else if (!lck_rw_lock_exclusive_gen(lock, lock_pause)) {
1200 		/*
1201 		 * lck_rw_lock_exclusive_gen() should only return
1202 		 * early if lock_pause has been passed and
1203 		 * returns FALSE. lock_pause is exclusive with
1204 		 * lck_rw_can_sleep().
1205 		 */
1206 		assert(!lock->lck_rw_can_sleep);
1207 		return FALSE;
1208 	}
1209 
1210 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1211 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1212 	ordered_store_rw_owner(lock, thread->ctid);
1213 
1214 #if DEBUG_RW
1215 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1216 #endif /* DEBUG_RW */
1217 
1218 	return TRUE;
1219 }
1220 
1221 __attribute__((noinline))
1222 static void
1223 lck_rw_lock_exclusive_internal(
1224 	lck_rw_t        *lock,
1225 	void            *caller)
1226 {
1227 	(void) lck_rw_lock_exclusive_internal_inline(lock, caller, NULL);
1228 }
1229 
1230 /*!
1231  * @function lck_rw_lock_exclusive
1232  *
1233  * @abstract
1234  * Locks a rw_lock in exclusive mode.
1235  *
1236  * @discussion
1237  * This function can block.
1238  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1239  * can acquire it in exclusive mode.
1240  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1241  *
1242  * @param lock           rw_lock to lock.
1243  */
1244 __mockable
1245 void
1246 lck_rw_lock_exclusive(
1247 	lck_rw_t        *lock)
1248 {
1249 	(void) lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), NULL);
1250 }
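
/*
 * Usage sketch: exclusive acquire/release around a write-side critical
 * section, assuming "lock" was initialized with lck_rw_init() or
 * lck_rw_alloc_init().
 *
 *	lck_rw_lock_exclusive(lock);
 *	... mutate the protected state ...
 *	lck_rw_unlock_exclusive(lock);
 */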
1251 
1252 /*!
1253  * @function lck_rw_lock_exclusive_b
1254  *
1255  * @abstract
1256  * Locks a rw_lock in exclusive mode. Returns early if the lock can't be acquired
1257  * and the specified block returns true.
1258  *
1259  * @discussion
1260  * Identical to lck_rw_lock_exclusive() but can return early if the lock can't be
1261  * acquired and the specified block returns true. The block is called
1262  * repeatedly when waiting to acquire the lock.
1263  * Should only be called when the lock cannot sleep (i.e. when
1264  * lock->lck_rw_can_sleep is false).
1265  *
1266  * @param lock           rw_lock to lock.
1267  * @param lock_pause     block invoked while waiting to acquire lock
1268  *
1269  * @returns              Returns TRUE if the lock is successfully taken,
1270  *                       FALSE if the block returns true and the lock has
1271  *                       not been acquired.
1272  */
1273 boolean_t
1274 lck_rw_lock_exclusive_b(
1275 	lck_rw_t        *lock,
1276 	bool            (^lock_pause)(void))
1277 {
1278 	assert(!lock->lck_rw_can_sleep);
1279 
1280 	return lck_rw_lock_exclusive_internal_inline(lock, __builtin_return_address(0), lock_pause);
1281 }
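
/*
 * Usage sketch: bounded acquisition of a non-sleepable lock with a pause
 * block; "should_abort()" is a hypothetical caller-supplied predicate.
 * lck_rw_lock_shared_b() below follows the same pattern for shared mode.
 *
 *	if (lck_rw_lock_exclusive_b(lock, ^{ return should_abort(); })) {
 *	        ... exclusive section ...
 *	        lck_rw_unlock_exclusive(lock);
 *	}
 */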
1282 
1283 /*
1284  *	Routine:	lck_rw_lock_shared_gen
1285  *	Function:
1286  *		Fast path code has determined that this lock
1287  *		is held exclusively... this is where we spin/block
1288  *		until we can acquire the lock in the shared mode
1289  */
1290 static boolean_t
1291 lck_rw_lock_shared_gen(
1292 	lck_rw_t        *lck,
1293 	bool            (^lock_pause)(void))
1294 {
1295 	__assert_only thread_t  self = current_thread();
1296 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1297 	lck_rw_word_t           word;
1298 	lck_rw_grab_state_t     grab_state = LCK_RW_GRAB_S_NOT_LOCKED;
1299 	int                     slept = 0;
1300 	wait_result_t           res = 0;
1301 	boolean_t               istate;
1302 
1303 #if     CONFIG_DTRACE
1304 	uint64_t wait_interval = 0;
1305 	int readers_at_sleep = 0;
1306 	boolean_t dtrace_ls_initialized = FALSE;
1307 	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1308 #endif /* CONFIG_DTRACE */
1309 
1310 	assertf(lck->lck_rw_owner != self->ctid,
1311 	    "Lock already held state=0x%x, owner=%p",
1312 	    ordered_load_rw(lck), self);
1313 
1314 #ifdef DEBUG_RW
1315 	/*
1316 	 * Best effort attempt to check that this thread
1317 	 * is not already holding the lock in shared mode.
1318 	 */
1319 	assert_canlock_rwlock(lck, self, LCK_RW_TYPE_SHARED);
1320 #endif
1321 
1322 	while (lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED, NULL) != LCK_RW_GRAB_S_LOCKED) {
1323 #if     CONFIG_DTRACE
1324 		if (dtrace_ls_initialized == FALSE) {
1325 			dtrace_ls_initialized = TRUE;
1326 			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1327 			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1328 			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1329 			if (dtrace_ls_enabled) {
1330 				/*
1331 				 * Either sleeping or spinning is happening,
1332 				 *  start a timing of our delay interval now.
1333 				 */
1334 				readers_at_sleep = lck->lck_rw_shared_count;
1335 				wait_interval = mach_absolute_time();
1336 			}
1337 		}
1338 #endif
1339 
1340 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1341 		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, 0, 0);
1342 
1343 		grab_state = lck_rw_grab(lck, LCK_RW_GRAB_F_SHARED | LCK_RW_GRAB_F_WAIT, lock_pause);
1344 
1345 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1346 		    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, grab_state, 0);
1347 
1348 		if (grab_state == LCK_RW_GRAB_S_LOCKED ||
1349 		    grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
1350 			break;
1351 		}
1352 
1353 		/*
1354 		 * if we get here, the deadline has expired w/o us
1355 		 * being able to grab the lock for read
1356 		 * check to see if we're allowed to do a thread_block
1357 		 */
1358 		if (lck->lck_rw_can_sleep) {
1359 			istate = lck_interlock_lock(lck);
1360 
1361 			word.data = ordered_load_rw(lck);
1362 			if ((word.want_excl || word.want_upgrade) &&
1363 			    ((word.shared_count == 0) || word.priv_excl)) {
1364 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1365 				    trace_lck, word.want_excl, word.want_upgrade, 0, 0);
1366 
1367 				word.r_waiting = 1;
1368 				ordered_store_rw(lck, word.data);
1369 
1370 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1371 				res = assert_wait(LCK_RW_READER_EVENT(lck),
1372 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1373 				lck_interlock_unlock(lck, istate);
1374 
1375 				if (res == THREAD_WAITING) {
1376 					res = thread_block(THREAD_CONTINUE_NULL);
1377 					slept++;
1378 				}
1379 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1380 				    trace_lck, res, slept, 0, 0);
1381 			} else {
1382 				word.shared_count++;
1383 				ordered_store_rw(lck, word.data);
1384 				lck_interlock_unlock(lck, istate);
1385 				break;
1386 			}
1387 		}
1388 	}
1389 
1390 #if     CONFIG_DTRACE
1391 	if (dtrace_ls_enabled == TRUE) {
1392 		if (slept == 0) {
1393 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1394 		} else {
1395 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1396 			    mach_absolute_time() - wait_interval, 0,
1397 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1398 		}
1399 	}
1400 #endif /* CONFIG_DTRACE */
1401 
1402 	if (grab_state == LCK_RW_GRAB_S_EARLY_RETURN) {
1403 		assert(lock_pause);
1404 		return FALSE;
1405 	}
1406 
1407 #if     CONFIG_DTRACE
1408 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1409 #endif  /* CONFIG_DTRACE */
1410 
1411 	return TRUE;
1412 }
1413 
1414 __attribute__((always_inline))
1415 static boolean_t
1416 lck_rw_lock_shared_internal_inline(
1417 	lck_rw_t        *lock,
1418 	void            *caller,
1419 	bool            (^lock_pause)(void))
1420 {
1421 #pragma unused(caller)
1422 
1423 	uint32_t        data, prev;
1424 	thread_t        thread = current_thread();
1425 #ifdef DEBUG_RW
1426 	boolean_t       check_canlock = TRUE;
1427 #endif
1428 
1429 	if (lock->lck_rw_can_sleep) {
1430 		lck_rw_lock_check_preemption(lock);
1431 		lck_rw_lock_count_inc(thread, lock);
1432 	} else if (get_preemption_level() == 0) {
1433 		panic("Taking non-sleepable RW lock with preemption enabled");
1434 	}
1435 
1436 	for (;;) {
1437 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1438 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1439 			atomic_exchange_abort();
1440 			if (!lck_rw_lock_shared_gen(lock, lock_pause)) {
1441 				/*
1442 				 * lck_rw_lock_shared_gen() should only return
1443 				 * early if lock_pause has been passed and
1444 				 * returns FALSE. lock_pause is exclusive with
1445 				 * lck_rw_can_sleep().
1446 				 */
1447 				assert(!lock->lck_rw_can_sleep);
1448 				return FALSE;
1449 			}
1450 
1451 			goto locked;
1452 		}
1453 #ifdef DEBUG_RW
1454 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1455 			/*
1456 			 * If the lock is uncontended,
1457 			 * we do not need to check if we can lock it
1458 			 */
1459 			check_canlock = FALSE;
1460 		}
1461 #endif
1462 		data += LCK_RW_SHARED_READER;
1463 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1464 			break;
1465 		}
1466 		cpu_pause();
1467 	}
1468 #ifdef DEBUG_RW
1469 	if (check_canlock) {
1470 		/*
1471 		 * Best effort attempt to check that this thread
1472 		 * is not already holding the lock (this checks read mode too).
1473 		 */
1474 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1475 	}
1476 #endif
1477 locked:
1478 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1479 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1480 
1481 #if     CONFIG_DTRACE
1482 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1483 #endif  /* CONFIG_DTRACE */
1484 
1485 #ifdef DEBUG_RW
1486 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1487 #endif /* DEBUG_RW */
1488 
1489 	return TRUE;
1490 }
1491 
1492 __attribute__((noinline))
1493 static void
1494 lck_rw_lock_shared_internal(
1495 	lck_rw_t        *lock,
1496 	void            *caller)
1497 {
1498 	(void) lck_rw_lock_shared_internal_inline(lock, caller, NULL);
1499 }
1500 
1501 /*!
1502  * @function lck_rw_lock_shared
1503  *
1504  * @abstract
1505  * Locks a rw_lock in shared mode.
1506  *
1507  * @discussion
1508  * This function can block.
1509  * Multiple threads can acquire the lock in shared mode at the same time, but only one thread at a time
1510  * can acquire it in exclusive mode.
1511  * If the lock is held in shared mode and there are no writers waiting, a reader will be able to acquire
1512  * the lock without waiting.
1513  * If the lock is held in shared mode and there is at least one writer waiting, a reader will wait
1514  * for all the writers to make progress if the lock was initialized with the default settings. If instead
1515  * RW_SHARED_PRIORITY was selected at initialization time, a reader will never wait if the lock is held
1516  * in shared mode.
1517  * NOTE: the thread cannot return to userspace while the lock is held. Recursive locking is not supported.
1518  *
1519  * @param lock           rw_lock to lock.
1520  */
1521 __mockable
1522 void
1523 lck_rw_lock_shared(
1524 	lck_rw_t        *lock)
1525 {
1526 	(void) lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), NULL);
1527 }
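
/*
 * Usage sketch: shared (read-side) acquire/release; any number of readers
 * may hold the lock in this mode concurrently. "lock" is assumed to be an
 * initialized lck_rw_t.
 *
 *	lck_rw_lock_shared(lock);
 *	... read the protected state ...
 *	lck_rw_unlock_shared(lock);
 */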
1528 
1529 /*!
1530  * @function lck_rw_lock_shared_b
1531  *
1532  * @abstract
1533  * Locks a rw_lock in shared mode. Returns early if the lock can't be acquired
1534  * and the specified block returns true.
1535  *
1536  * @discussion
1537  * Identical to lck_rw_lock_shared() but can return early if the lock can't be
1538  * acquired and the specified block returns true. The block is called
1539  * repeatedly when waiting to acquire the lock.
1540  * Should only be called when the lock cannot sleep (i.e. when
1541  * lock->lck_rw_can_sleep is false).
1542  *
1543  * @param lock           rw_lock to lock.
1544  * @param lock_pause     block invoked while waiting to acquire lock
1545  *
1546  * @returns              Returns TRUE if the lock is successfully taken,
1547  *                       FALSE if the block returns true and the lock has
1548  *                       not been acquired.
1549  */
1550 boolean_t
1551 lck_rw_lock_shared_b(
1552 	lck_rw_t        *lock,
1553 	bool            (^lock_pause)(void))
1554 {
1555 	assert(!lock->lck_rw_can_sleep);
1556 
1557 	return lck_rw_lock_shared_internal_inline(lock, __builtin_return_address(0), lock_pause);
1558 }
1559 
1560 /*
1561  *	Routine:	lck_rw_lock_shared_to_exclusive_failure
1562  *	Function:
1563  *		Fast path code has already dropped our read
1564  *		count and determined that someone else owns 'lck_rw_want_upgrade'
1565  *		if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1566  *		if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting'
1567  */
1568 static boolean_t
1569 lck_rw_lock_shared_to_exclusive_failure(
1570 	lck_rw_t        *lck,
1571 	uint32_t        prior_lock_state)
1572 {
1573 	thread_t        thread = current_thread();
1574 
1575 	if ((prior_lock_state & LCK_RW_W_WAITING) &&
1576 	    ((prior_lock_state & LCK_RW_SHARED_MASK) == LCK_RW_SHARED_READER)) {
1577 		/*
1578 		 *	Someone else has requested upgrade.
1579 		 *	Since we've released the read lock, wake
1580 		 *	him up if he's blocked waiting
1581 		 */
1582 		thread_wakeup(LCK_RW_WRITER_EVENT(lck));
1583 	}
1584 
1585 	/* Check if dropping the lock means that we need to unpromote */
1586 	if (lck->lck_rw_can_sleep) {
1587 		lck_rw_lock_count_dec(thread, lck);
1588 	}
1589 
1590 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1591 	    VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1592 
1593 #ifdef DEBUG_RW
1594 	remove_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
1595 #endif /* DEBUG_RW */
1596 
1597 	return FALSE;
1598 }
1599 
1600 /*
1601  *	Routine:	lck_rw_lock_shared_to_exclusive_success
1602  *	Function:
1603  *		the fast path code has already dropped our read
1604  *		count and successfully acquired 'lck_rw_want_upgrade'
1605  *		we just need to wait for the rest of the readers to drain
1606  *		and then we can return as the exclusive holder of this lock
1607  */
1608 static void
1609 lck_rw_lock_shared_to_exclusive_success(
1610 	lck_rw_t        *lock)
1611 {
1612 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lock);
1613 	int                     slept = 0;
1614 	lck_rw_word_t           word;
1615 	wait_result_t           res;
1616 	boolean_t               istate;
1617 	lck_rw_drain_state_t    drain_state;
1618 
1619 #if     CONFIG_DTRACE
1620 	uint64_t                wait_interval = 0;
1621 	int                     readers_at_sleep = 0;
1622 	boolean_t               dtrace_ls_initialized = FALSE;
1623 	boolean_t               dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1624 #endif
1625 
1626 	while (lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, FALSE, NULL) != LCK_RW_DRAIN_S_DRAINED) {
1627 		word.data = ordered_load_rw(lock);
1628 #if     CONFIG_DTRACE
1629 		if (dtrace_ls_initialized == FALSE) {
1630 			dtrace_ls_initialized = TRUE;
1631 			dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1632 			dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1633 			dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1634 			if (dtrace_ls_enabled) {
1635 				/*
1636 				 * Either sleeping or spinning is happening,
1637 				 *  start a timing of our delay interval now.
1638 				 */
1639 				readers_at_sleep = word.shared_count;
1640 				wait_interval = mach_absolute_time();
1641 			}
1642 		}
1643 #endif
1644 
1645 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1646 		    trace_lck, word.shared_count, 0, 0, 0);
1647 
1648 		drain_state = lck_rw_drain_status(lock, LCK_RW_SHARED_MASK, TRUE, NULL);
1649 
1650 		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1651 		    trace_lck, lock->lck_rw_shared_count, 0, 0, 0);
1652 
1653 		if (drain_state == LCK_RW_DRAIN_S_DRAINED) {
1654 			break;
1655 		}
1656 
1657 		/*
1658 		 * if we get here, the spin deadline in lck_rw_wait_on_status()
1659 		 * has expired w/o the rw_shared_count having drained to 0
1660 		 * check to see if we're allowed to do a thread_block
1661 		 */
1662 		if (word.can_sleep) {
1663 			istate = lck_interlock_lock(lock);
1664 
1665 			word.data = ordered_load_rw(lock);
1666 			if (word.shared_count != 0) {
1667 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1668 				    trace_lck, word.shared_count, 0, 0, 0);
1669 
1670 				word.w_waiting = 1;
1671 				ordered_store_rw(lock, word.data);
1672 
1673 				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1674 				res = assert_wait(LCK_RW_WRITER_EVENT(lock),
1675 				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1676 				lck_interlock_unlock(lock, istate);
1677 
1678 				if (res == THREAD_WAITING) {
1679 					res = thread_block(THREAD_CONTINUE_NULL);
1680 					slept++;
1681 				}
1682 				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1683 				    trace_lck, res, slept, 0, 0);
1684 			} else {
1685 				lck_interlock_unlock(lock, istate);
1686 				break;
1687 			}
1688 		}
1689 	}
1690 #if     CONFIG_DTRACE
1691 	/*
1692 	 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1693 	 */
1694 	if (dtrace_ls_enabled == TRUE) {
1695 		if (slept == 0) {
1696 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lock, mach_absolute_time() - wait_interval, 0);
1697 		} else {
1698 			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lock,
1699 			    mach_absolute_time() - wait_interval, 1,
1700 			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1701 		}
1702 	}
1703 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 1);
1704 #endif
1705 }
1706 
1707 /*!
1708  * @function lck_rw_lock_shared_to_exclusive
1709  *
1710  * @abstract
1711  * Upgrades a rw_lock held in shared mode to exclusive.
1712  *
1713  * @discussion
1714  * This function can block.
1715  * Only one reader at a time can upgrade to exclusive mode. If the upgrade fails, the function will
1716  * return with the lock not held.
1717  * The caller needs to hold the lock in shared mode to upgrade it.
1718  *
1719  * @param lock           rw_lock already held in shared mode to upgrade.
1720  *
1721  * @returns TRUE if the lock was upgraded, FALSE if it was not possible.
1722  *          If the function was not able to upgrade the lock, the lock will be dropped
1723  *          by the function.
1724  */
1725 __mockable
1726 boolean_t
1727 lck_rw_lock_shared_to_exclusive(
1728 	lck_rw_t        *lock)
1729 {
1730 	thread_t thread = current_thread();
1731 	uint32_t data, prev;
1732 
1733 	assertf(lock->lck_rw_priv_excl != 0, "lock %p thread %p", lock, current_thread());
1734 
1735 #if DEBUG_RW
1736 	assert_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1737 #endif /* DEBUG_RW */
1738 
1739 	for (;;) {
1740 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1741 		if (data & LCK_RW_INTERLOCK) {
1742 			atomic_exchange_abort();
1743 			lck_rw_interlock_spin(lock);
1744 			continue;
1745 		}
1746 		if (data & LCK_RW_WANT_UPGRADE) {
1747 			data -= LCK_RW_SHARED_READER;
1748 			if ((data & LCK_RW_SHARED_MASK) == 0) {         /* we were the last reader */
1749 				data &= ~(LCK_RW_W_WAITING);            /* so clear the wait indicator */
1750 			}
1751 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1752 				return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1753 			}
1754 		} else {
1755 			data |= LCK_RW_WANT_UPGRADE;            /* ask for WANT_UPGRADE */
1756 			data -= LCK_RW_SHARED_READER;           /* and shed our read count */
1757 			if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1758 				break;
1759 			}
1760 		}
1761 		cpu_pause();
1762 	}
1763 	/* we now own the WANT_UPGRADE */
1764 	if (data & LCK_RW_SHARED_MASK) {        /* check to see if all of the readers are drained */
1765 		lck_rw_lock_shared_to_exclusive_success(lock);  /* if not, we need to go wait */
1766 	}
1767 
1768 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1769 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1770 
1771 	ordered_store_rw_owner(lock, thread->ctid);
1772 #if     CONFIG_DTRACE
1773 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1774 #endif  /* CONFIG_DTRACE */
1775 
1776 #if DEBUG_RW
1777 	change_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, __builtin_return_address(0));
1778 #endif /* DEBUG_RW */
1779 	return TRUE;
1780 }
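/*
 * Usage sketch (editorial illustration, assumed names): the classic
 * read-then-upgrade pattern. If the upgrade fails the lock is dropped by
 * lck_rw_lock_shared_to_exclusive(), so the writer path must re-acquire
 * exclusively and re-validate whatever it observed under the read hold.
 * example_needs_update() and example_do_update() are hypothetical helpers.
 */
#if 0 /* illustrative sketch, not built */
static void
example_update(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);

	if (!example_needs_update()) {                  /* check under the read hold */
		lck_rw_unlock_shared(lock);
		return;
	}
	if (!lck_rw_lock_shared_to_exclusive(lock)) {
		/* upgrade failed: lock was dropped, take it exclusive and re-check */
		lck_rw_lock_exclusive(lock);
		if (!example_needs_update()) {
			lck_rw_unlock_exclusive(lock);
			return;
		}
	}
	/* now held exclusive */
	example_do_update();                            /* hypothetical modification */
	lck_rw_unlock_exclusive(lock);
}
#endif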
1781 
1782 /*
1783  *      Routine:        lck_rw_lock_exclusive_to_shared_gen
1784  *      Function:
1785  *		Fast path has already dropped
1786  *		our exclusive state and bumped lck_rw_shared_count
1787  *		all we need to do here is determine if anyone
1788  *		needs to be awakened.
1789  */
1790 static void
1791 lck_rw_lock_exclusive_to_shared_gen(
1792 	lck_rw_t        *lck,
1793 	uint32_t        prior_lock_state,
1794 	void            *caller)
1795 {
1796 #pragma unused(caller)
1797 	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1798 	lck_rw_word_t   fake_lck;
1799 
1800 	/*
1801 	 * prior_lock state is a snapshot of the 1st word of the
1802 	 * lock in question... we'll fake up a pointer to it
1803 	 * and carefully not access anything beyond what's defined
1804 	 * in the first word of a lck_rw_t
1805 	 */
1806 	fake_lck.data = prior_lock_state;
1807 
1808 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1809 	    trace_lck, fake_lck.want_excl, fake_lck.want_upgrade, 0, 0);
1810 
1811 	/*
1812 	 * don't wake up anyone waiting to take the lock exclusively
1813 	 * since we hold a read count... when the read count drops to 0,
1814 	 * the writers will be woken.
1815 	 *
1816 	 * wake up any waiting readers if we don't have any writers waiting,
1817 	 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1818 	 */
1819 	if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
1820 		thread_wakeup(LCK_RW_READER_EVENT(lck));
1821 	}
1822 
1823 	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1824 	    trace_lck, lck->lck_rw_want_excl, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1825 
1826 #if CONFIG_DTRACE
1827 	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1828 #endif
1829 
1830 #if DEBUG_RW
1831 	thread_t        thread = current_thread();
1832 	change_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
1833 #endif /* DEBUG_RW */
1834 }
1835 
1836 /*!
1837  * @function lck_rw_lock_exclusive_to_shared
1838  *
1839  * @abstract
1840  * Downgrades a rw_lock held in exclusive mode to shared.
1841  *
1842  * @discussion
1843  * The caller needs to hold the lock in exclusive mode to be able to downgrade it.
1844  *
1845  * @param lock           rw_lock already held in exclusive mode to downgrade.
1846  */
1847 __mockable
1848 void
1849 lck_rw_lock_exclusive_to_shared(
1850 	lck_rw_t        *lock)
1851 {
1852 	uint32_t        data, prev;
1853 
1854 	assertf(lock->lck_rw_owner == current_thread()->ctid,
1855 	    "state=0x%x, owner=%p", lock->lck_rw_data,
1856 	    ctid_get_thread_unsafe(lock->lck_rw_owner));
1857 	ordered_store_rw_owner(lock, 0);
1858 
1859 	for (;;) {
1860 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
1861 		if (data & LCK_RW_INTERLOCK) {
1862 			atomic_exchange_abort();
1863 			lck_rw_interlock_spin(lock);    /* wait for interlock to clear */
1864 			continue;
1865 		}
1866 		data += LCK_RW_SHARED_READER;
1867 		if (data & LCK_RW_WANT_UPGRADE) {
1868 			data &= ~(LCK_RW_WANT_UPGRADE);
1869 		} else {
1870 			data &= ~(LCK_RW_WANT_EXCL);
1871 		}
1872 		if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1873 			data &= ~(LCK_RW_W_WAITING);
1874 		}
1875 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
1876 			break;
1877 		}
1878 		cpu_pause();
1879 	}
1880 	lck_rw_lock_exclusive_to_shared_gen(lock, prev, __builtin_return_address(0));
1881 }
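/*
 * Usage sketch (editorial illustration, assumed names): publish an update
 * while holding the lock exclusively, then downgrade to shared so other
 * readers can proceed while this thread keeps reading the new state.
 * example_do_update() and example_read_result() are hypothetical helpers.
 */
#if 0 /* illustrative sketch, not built */
static void
example_publish_then_read(lck_rw_t *lock)
{
	lck_rw_lock_exclusive(lock);
	example_do_update();                    /* hypothetical modification */
	lck_rw_lock_exclusive_to_shared(lock);  /* other readers may now enter */
	example_read_result();                  /* hypothetical read of the new state */
	lck_rw_unlock_shared(lock);
}
#endif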
1882 
1883 /*
1884  * Very sad hack, but the codegen for lck_rw_lock
1885  * is very unhappy with the combination of __builtin_return_address()
1886  * and a noreturn function. For some reason it adds more frames
1887  * than it should. rdar://76570684
1888  */
1889 void
1890 _lck_rw_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
1891 #pragma clang diagnostic push
1892 #pragma clang diagnostic ignored "-Wmissing-noreturn"
1893 __attribute__((noinline, weak))
1894 void
1895 _lck_rw_lock_type_panic(
1896 	lck_rw_t        *lck,
1897 	lck_rw_type_t   lck_rw_type)
1898 {
1899 	panic("lck_rw_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
1900 }
1901 #pragma clang diagnostic pop
1902 
1903 /*!
1904  * @function lck_rw_lock
1905  *
1906  * @abstract
1907  * Locks a rw_lock with the specified type.
1908  *
1909  * @discussion
1910  * See lck_rw_lock_shared() or lck_rw_lock_exclusive() for more details.
1911  *
1912  * @param lck           rw_lock to lock.
1913  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
1914  */
1915 __mockable
1916 void
1917 lck_rw_lock(
1918 	lck_rw_t        *lck,
1919 	lck_rw_type_t   lck_rw_type)
1920 {
1921 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1922 		return lck_rw_lock_shared_internal(lck, __builtin_return_address(0));
1923 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1924 		return lck_rw_lock_exclusive_internal(lck, __builtin_return_address(0));
1925 	}
1926 	_lck_rw_lock_type_panic(lck, lck_rw_type);
1927 }
1928 
1929 __attribute__((always_inline))
1930 static boolean_t
1931 lck_rw_try_lock_shared_internal_inline(
1932 	lck_rw_t        *lock,
1933 	void            *caller)
1934 {
1935 #pragma unused(caller)
1936 
1937 	uint32_t        data, prev;
1938 	thread_t        thread = current_thread();
1939 #ifdef DEBUG_RW
1940 	boolean_t       check_canlock = TRUE;
1941 #endif
1942 
1943 	for (;;) {
1944 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
1945 		if (data & LCK_RW_INTERLOCK) {
1946 			atomic_exchange_abort();
1947 			lck_rw_interlock_spin(lock);
1948 			continue;
1949 		}
1950 		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1951 			atomic_exchange_abort();
1952 			return FALSE;             /* lock is busy */
1953 		}
1954 #ifdef DEBUG_RW
1955 		if ((data & LCK_RW_SHARED_MASK) == 0) {
1956 			/*
1957 			 * If the lock is uncontended,
1958 			 * we do not need to check if we can lock it
1959 			 */
1960 			check_canlock = FALSE;
1961 		}
1962 #endif
1963 		data += LCK_RW_SHARED_READER;     /* Increment reader refcount */
1964 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
1965 			break;
1966 		}
1967 		cpu_pause();
1968 	}
1969 #ifdef DEBUG_RW
1970 	if (check_canlock) {
1971 		/*
1972 		 * Best effort attempt to check that this thread
1973 		 * is not already holding the lock (this checks read mode too).
1974 		 */
1975 		assert_canlock_rwlock(lock, thread, LCK_RW_TYPE_SHARED);
1976 	}
1977 #endif
1978 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
1979 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
1980 
1981 	if (lock->lck_rw_can_sleep) {
1982 		lck_rw_lock_count_inc(thread, lock);
1983 	} else if (get_preemption_level() == 0) {
1984 		panic("Taking non-sleepable RW lock with preemption enabled");
1985 	}
1986 
1987 #if     CONFIG_DTRACE
1988 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1989 #endif  /* CONFIG_DTRACE */
1990 
1991 #ifdef DEBUG_RW
1992 	add_held_rwlock(lock, thread, LCK_RW_TYPE_SHARED, caller);
1993 #endif /* DEBUG_RW */
1994 	return TRUE;
1995 }
1996 
1997 __attribute__((noinline))
1998 static boolean_t
1999 lck_rw_try_lock_shared_internal(
2000 	lck_rw_t        *lock,
2001 	void            *caller)
2002 {
2003 	return lck_rw_try_lock_shared_internal_inline(lock, caller);
2004 }
2005 
2006 /*!
2007  * @function lck_rw_try_lock_shared
2008  *
2009  * @abstract
2010  * Tries to lock a rw_lock in read (shared) mode.
2011  *
2012  * @discussion
2013  * This function will return immediately, and not block, if the lock cannot be taken in shared mode.
2014  * See lck_rw_lock_shared for more details.
2015  *
2016  * @param lock           rw_lock to lock.
2017  *
2018  * @returns TRUE if the lock is successfully acquired, FALSE if it is held for exclusive use or an upgrade is pending.
2019  */
2020 __mockable
2021 boolean_t
2022 lck_rw_try_lock_shared(
2023 	lck_rw_t        *lock)
2024 {
2025 	return lck_rw_try_lock_shared_internal_inline(lock, __builtin_return_address(0));
2026 }
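/*
 * Usage sketch (editorial illustration, assumed names): an opportunistic
 * reader that must not block, e.g. a fast path with a slower fallback used
 * when a writer is active or pending. example_read_value() is hypothetical.
 */
#if 0 /* illustrative sketch, not built */
static bool
example_try_fast_read(lck_rw_t *lock, uint32_t *out)
{
	if (!lck_rw_try_lock_shared(lock)) {
		return false;                   /* writer active/pending: use slow path */
	}
	*out = example_read_value();            /* hypothetical read */
	lck_rw_unlock_shared(lock);
	return true;
}
#endif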
2027 
2028 __attribute__((always_inline))
2029 static boolean_t
2030 lck_rw_try_lock_exclusive_internal_inline(
2031 	lck_rw_t        *lock,
2032 	void            *caller)
2033 {
2034 #pragma unused(caller)
2035 	uint32_t        data, prev;
2036 
2037 	for (;;) {
2038 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_acquire_smp);
2039 		if (data & LCK_RW_INTERLOCK) {
2040 			atomic_exchange_abort();
2041 			lck_rw_interlock_spin(lock);
2042 			continue;
2043 		}
2044 		if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
2045 			atomic_exchange_abort();
2046 			return FALSE;
2047 		}
2048 		data |= LCK_RW_WANT_EXCL;
2049 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_acquire_smp)) {
2050 			break;
2051 		}
2052 		cpu_pause();
2053 	}
2054 	thread_t thread = current_thread();
2055 
2056 	if (lock->lck_rw_can_sleep) {
2057 		lck_rw_lock_count_inc(thread, lock);
2058 	} else if (get_preemption_level() == 0) {
2059 		panic("Taking non-sleepable RW lock with preemption enabled");
2060 	}
2061 
2062 	assertf(lock->lck_rw_owner == 0, "state=0x%x, owner=%p",
2063 	    ordered_load_rw(lock), ctid_get_thread_unsafe(lock->lck_rw_owner));
2064 
2065 	ordered_store_rw_owner(lock, thread->ctid);
2066 #if     CONFIG_DTRACE
2067 	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
2068 #endif  /* CONFIG_DTRACE */
2069 
2070 #ifdef DEBUG_RW
2071 	add_held_rwlock(lock, thread, LCK_RW_TYPE_EXCLUSIVE, caller);
2072 #endif /* DEBUG_RW */
2073 	return TRUE;
2074 }
2075 
2076 __attribute__((noinline))
2077 static boolean_t
2078 lck_rw_try_lock_exclusive_internal(
2079 	lck_rw_t        *lock,
2080 	void            *caller)
2081 {
2082 	return lck_rw_try_lock_exclusive_internal_inline(lock, caller);
2083 }
2084 
2085 /*!
2086  * @function lck_rw_try_lock_exclusive
2087  *
2088  * @abstract
2089  * Tries to lock a rw_lock in write (exclusive) mode.
2090  *
2091  * @discussion
2092  * This function will return immediately, and not block, if the lock is already held.
2093  * See lck_rw_lock_exclusive for more details.
2094  *
2095  * @param lock           rw_lock to lock.
2096  *
2097  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
2098  */
2099 __mockable
2100 boolean_t
2101 lck_rw_try_lock_exclusive(
2102 	lck_rw_t        *lock)
2103 {
2104 	return lck_rw_try_lock_exclusive_internal_inline(lock, __builtin_return_address(0));
2105 }
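/*
 * Usage sketch (editorial illustration, assumed names): attempt an exclusive
 * acquisition without blocking and defer the work when the lock is busy.
 * example_do_update() is a hypothetical helper.
 */
#if 0 /* illustrative sketch, not built */
static bool
example_try_update(lck_rw_t *lock)
{
	if (!lck_rw_try_lock_exclusive(lock)) {
		return false;                   /* busy: caller re-queues the work */
	}
	example_do_update();                    /* hypothetical modification */
	lck_rw_unlock_exclusive(lock);
	return true;
}
#endif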
2106 
2107 /*
2108  * Very sad hack, but the codegen for lck_rw_try_lock
2109  * is very unhappy with the combination of __builtin_return_address()
2110  * and a noreturn function. For some reason it adds more frames
2111  * than it should. rdar://76570684
2112  */
2113 boolean_t
2114 _lck_rw_try_lock_type_panic(lck_rw_t *lck, lck_rw_type_t lck_rw_type);
2115 #pragma clang diagnostic push
2116 #pragma clang diagnostic ignored "-Wmissing-noreturn"
2117 __attribute__((noinline, weak))
2118 boolean_t
2119 _lck_rw_try_lock_type_panic(
2120 	lck_rw_t        *lck,
2121 	lck_rw_type_t   lck_rw_type)
2122 {
2123 	panic("lck_rw_lock(): Invalid RW lock type: %x for lock %p", lck_rw_type, lck);
2124 }
2125 #pragma clang diagnostic pop
2126 
2127 /*!
2128  * @function lck_rw_try_lock
2129  *
2130  * @abstract
2131  * Tries to lock a rw_lock with the specified type.
2132  *
2133  * @discussion
2134  * This function will return immediately, and not wait/block, if the lock is already held.
2135  * See lck_rw_try_lock_shared() or lck_rw_try_lock_exclusive() for more details.
2136  *
2137  * @param lck           rw_lock to lock.
2138  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2139  *
2140  * @returns TRUE if the lock is successfully acquired, FALSE in case it was already held.
2141  */
2142 __mockable
2143 boolean_t
2144 lck_rw_try_lock(
2145 	lck_rw_t        *lck,
2146 	lck_rw_type_t   lck_rw_type)
2147 {
2148 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2149 		return lck_rw_try_lock_shared_internal(lck, __builtin_return_address(0));
2150 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2151 		return lck_rw_try_lock_exclusive_internal(lck, __builtin_return_address(0));
2152 	}
2153 	return _lck_rw_try_lock_type_panic(lck, lck_rw_type);
2154 }
2155 
2156 /*
2157  *      Routine:        lck_rw_done_gen
2158  *
2159  *	prior_lock_state is the value in the 1st
2160  *      word of the lock at the time of a successful
2161  *	atomic compare and exchange with the new value...
2162  *      it represents the state of the lock before we
2163  *	decremented the rw_shared_count or cleared either
2164  *      rw_want_upgrade or rw_want_write and
2165  *	the lck_x_waiting bits...  since the wrapper
2166  *      routine has already changed the state atomically,
2167  *	we just need to decide if we should
2168  *	wake up anyone and what value to return... we do
2169  *	this by examining the state of the lock before
2170  *	we changed it
2171  */
2172 static lck_rw_type_t
2173 lck_rw_done_gen(
2174 	lck_rw_t        *lck,
2175 	uint32_t        prior_lock_state)
2176 {
2177 	lck_rw_word_t   fake_lck;
2178 	lck_rw_type_t   lock_type;
2179 	thread_t        thread;
2180 
2181 	/*
2182 	 * prior_lock state is a snapshot of the 1st word of the
2183 	 * lock in question... we'll fake up a pointer to it
2184 	 * and carefully not access anything beyond what's defined
2185 	 * in the first word of a lck_rw_t
2186 	 */
2187 	fake_lck.data = prior_lock_state;
2188 
2189 	if (fake_lck.shared_count <= 1) {
2190 		if (fake_lck.w_waiting) {
2191 			thread_wakeup(LCK_RW_WRITER_EVENT(lck));
2192 		}
2193 
2194 		if (!(fake_lck.priv_excl && fake_lck.w_waiting) && fake_lck.r_waiting) {
2195 			thread_wakeup(LCK_RW_READER_EVENT(lck));
2196 		}
2197 	}
2198 	if (fake_lck.shared_count) {
2199 		lock_type = LCK_RW_TYPE_SHARED;
2200 	} else {
2201 		lock_type = LCK_RW_TYPE_EXCLUSIVE;
2202 	}
2203 
2204 	/* Check if dropping the lock means that we need to unpromote */
2205 	thread = current_thread();
2206 	if (fake_lck.can_sleep) {
2207 		lck_rw_lock_count_dec(thread, lck);
2208 	}
2209 
2210 #if CONFIG_DTRACE
2211 	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
2212 #endif
2213 
2214 #ifdef DEBUG_RW
2215 	remove_held_rwlock(lck, thread, lock_type);
2216 #endif /* DEBUG_RW */
2217 	return lock_type;
2218 }
2219 
2220 /*!
2221  * @function lck_rw_done
2222  *
2223  * @abstract
2224  * Force unlocks a rw_lock without consistency checks.
2225  *
2226  * @discussion
2227  * Do not use unless you are sure you can avoid the consistency checks.
2228  *
2229  * @param lock           rw_lock to unlock.
2230  */
2231 __mockable
2232 lck_rw_type_t
2233 lck_rw_done(
2234 	lck_rw_t        *lock)
2235 {
2236 	uint32_t        data, prev;
2237 	boolean_t       once = FALSE;
2238 
2239 #ifdef DEBUG_RW
2240 	/*
2241 	 * Best effort attempt to check that this thread
2242 	 * is holding the lock.
2243 	 */
2244 	thread_t thread = current_thread();
2245 	assert_held_rwlock(lock, thread, 0);
2246 #endif /* DEBUG_RW */
2247 	for (;;) {
2248 		data = atomic_exchange_begin32(&lock->lck_rw_data, &prev, memory_order_release_smp);
2249 		if (data & LCK_RW_INTERLOCK) {          /* wait for interlock to clear */
2250 			atomic_exchange_abort();
2251 			lck_rw_interlock_spin(lock);
2252 			continue;
2253 		}
2254 		if (data & LCK_RW_SHARED_MASK) {        /* lock is held shared */
2255 			assertf(lock->lck_rw_owner == 0,
2256 			    "state=0x%x, owner=%p", lock->lck_rw_data,
2257 			    ctid_get_thread_unsafe(lock->lck_rw_owner));
2258 			data -= LCK_RW_SHARED_READER;
2259 			if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
2260 				goto check_waiters;
2261 			}
2262 		} else {                                        /* if reader count == 0, must be exclusive lock */
2263 			if (data & LCK_RW_WANT_UPGRADE) {
2264 				data &= ~(LCK_RW_WANT_UPGRADE);
2265 			} else {
2266 				if (data & LCK_RW_WANT_EXCL) {
2267 					data &= ~(LCK_RW_WANT_EXCL);
2268 				} else {                                /* lock is not 'owned', panic */
2269 					panic("Releasing non-exclusive RW lock without a reader refcount!");
2270 				}
2271 			}
2272 			if (!once) {
2273 				// Only check for holder and clear it once
2274 				assertf(lock->lck_rw_owner == current_thread()->ctid,
2275 				    "state=0x%x, owner=%p", lock->lck_rw_data,
2276 				    ctid_get_thread_unsafe(lock->lck_rw_owner));
2277 				ordered_store_rw_owner(lock, 0);
2278 				once = TRUE;
2279 			}
2280 check_waiters:
2281 			/*
2282 			 * test the original values to match what
2283 			 * lck_rw_done_gen is going to do to determine
2284 			 * which wakeups need to happen...
2285 			 *
2286 			 * if !(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting)
2287 			 */
2288 			if (prev & LCK_RW_W_WAITING) {
2289 				data &= ~(LCK_RW_W_WAITING);
2290 				if ((prev & LCK_RW_PRIV_EXCL) == 0) {
2291 					data &= ~(LCK_RW_R_WAITING);
2292 				}
2293 			} else {
2294 				data &= ~(LCK_RW_R_WAITING);
2295 			}
2296 		}
2297 		if (atomic_exchange_complete32(&lock->lck_rw_data, prev, data, memory_order_release_smp)) {
2298 			break;
2299 		}
2300 		cpu_pause();
2301 	}
2302 	return lck_rw_done_gen(lock, prev);
2303 }
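/*
 * Usage sketch (editorial illustration, assumed names): lck_rw_done()
 * releases whatever mode is currently held and reports it, which is
 * convenient when the hold mode may have changed (e.g. after an upgrade or
 * downgrade) or is only known to the caller at runtime. example_after_write()
 * is a hypothetical helper.
 */
#if 0 /* illustrative sketch, not built */
static void
example_release_any(lck_rw_t *lock)
{
	lck_rw_type_t released;

	released = lck_rw_done(lock);
	if (released == LCK_RW_TYPE_EXCLUSIVE) {
		example_after_write();          /* e.g. kick off deferred work */
	}
}
#endif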
2304 
2305 /*!
2306  * @function lck_rw_unlock_shared
2307  *
2308  * @abstract
2309  * Unlocks a rw_lock previously locked in shared mode.
2310  *
2311  * @discussion
2312  * The same thread that locked the lock needs to unlock it.
2313  *
2314  * @param lck           rw_lock held in shared mode to unlock.
2315  */
2316 __mockable
2317 void
2318 lck_rw_unlock_shared(
2319 	lck_rw_t        *lck)
2320 {
2321 	lck_rw_type_t   ret;
2322 
2323 	assertf(lck->lck_rw_owner == 0,
2324 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2325 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2326 	assertf(lck->lck_rw_shared_count > 0, "shared_count=0x%x", lck->lck_rw_shared_count);
2327 	ret = lck_rw_done(lck);
2328 
2329 	if (ret != LCK_RW_TYPE_SHARED) {
2330 		panic("lck_rw_unlock_shared(): lock %p held in mode: %d", lck, ret);
2331 	}
2332 }
2333 
2334 /*!
2335  * @function lck_rw_unlock_exclusive
2336  *
2337  * @abstract
2338  * Unlocks a rw_lock previously locked in exclusive mode.
2339  *
2340  * @discussion
2341  * The same thread that locked the lock needs to unlock it.
2342  *
2343  * @param lck           rw_lock held in exclusive mode to unlock.
2344  */
2345 __mockable
2346 void
2347 lck_rw_unlock_exclusive(
2348 	lck_rw_t        *lck)
2349 {
2350 	lck_rw_type_t   ret;
2351 
2352 	assertf(lck->lck_rw_owner == current_thread()->ctid,
2353 	    "state=0x%x, owner=%p", lck->lck_rw_data,
2354 	    ctid_get_thread_unsafe(lck->lck_rw_owner));
2355 	ret = lck_rw_done(lck);
2356 
2357 	if (ret != LCK_RW_TYPE_EXCLUSIVE) {
2358 		panic("lck_rw_unlock_exclusive(): lock %p held in mode: %d", lck, ret);
2359 	}
2360 }
2361 
2362 /*!
2363  * @function lck_rw_unlock
2364  *
2365  * @abstract
2366  * Unlocks a rw_lock previously locked with lck_rw_type.
2367  *
2368  * @discussion
2369  * The lock must be unlocked by the same thread it was locked from.
2370  * The type of the lock/unlock have to match, unless an upgrade/downgrade was performed while
2371  * holding the lock.
2372  *
2373  * @param lck           rw_lock to unlock.
2374  * @param lck_rw_type   LCK_RW_TYPE_SHARED or LCK_RW_TYPE_EXCLUSIVE
2375  */
2376 __mockable
2377 void
2378 lck_rw_unlock(
2379 	lck_rw_t         *lck,
2380 	lck_rw_type_t    lck_rw_type)
2381 {
2382 	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2383 		lck_rw_unlock_shared(lck);
2384 	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2385 		lck_rw_unlock_exclusive(lck);
2386 	} else {
2387 		panic("lck_rw_unlock(): Invalid RW lock type: %d", lck_rw_type);
2388 	}
2389 }
2390 
2391 /*!
2392  * @function lck_rw_assert
2393  *
2394  * @abstract
2395  * Asserts the rw_lock is held.
2396  *
2397  * @discussion
2398  * read-write locks do not have a concept of ownership when held in shared mode,
2399  * so this function merely asserts that someone is holding the lock, not necessarily the caller.
2400  * However, if rw_lock_debug is on, a best-effort mechanism to track the owners is in place, and
2401  * this function can be more accurate.
2402  * Type can be LCK_RW_ASSERT_SHARED, LCK_RW_ASSERT_EXCLUSIVE, LCK_RW_ASSERT_HELD,
2403  * or LCK_RW_ASSERT_NOTHELD.
2404  *
2405  * @param lck   rw_lock to check.
2406  * @param type  assert type
2407  */
2408 __mockable
2409 void
2410 lck_rw_assert(
2411 	lck_rw_t        *lck,
2412 	unsigned int    type)
2413 {
2414 	thread_t thread = current_thread();
2415 
2416 	switch (type) {
2417 	case LCK_RW_ASSERT_SHARED:
2418 		if ((lck->lck_rw_shared_count != 0) &&
2419 		    (lck->lck_rw_owner == 0)) {
2420 #if DEBUG_RW
2421 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2422 #endif /* DEBUG_RW */
2423 			return;
2424 		}
2425 		break;
2426 	case LCK_RW_ASSERT_EXCLUSIVE:
2427 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2428 		    (lck->lck_rw_shared_count == 0) &&
2429 		    (lck->lck_rw_owner == thread->ctid)) {
2430 #if DEBUG_RW
2431 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2432 #endif /* DEBUG_RW */
2433 			return;
2434 		}
2435 		break;
2436 	case LCK_RW_ASSERT_HELD:
2437 		if (lck->lck_rw_shared_count != 0) {
2438 #if DEBUG_RW
2439 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_SHARED);
2440 #endif /* DEBUG_RW */
2441 			return;         // Held shared
2442 		}
2443 		if ((lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2444 		    (lck->lck_rw_owner == thread->ctid)) {
2445 #if DEBUG_RW
2446 			assert_held_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2447 #endif /* DEBUG_RW */
2448 			return;         // Held exclusive
2449 		}
2450 		break;
2451 	case LCK_RW_ASSERT_NOTHELD:
2452 		if ((lck->lck_rw_shared_count == 0) &&
2453 		    !(lck->lck_rw_want_excl || lck->lck_rw_want_upgrade) &&
2454 		    (lck->lck_rw_owner == 0)) {
2455 #ifdef DEBUG_RW
2456 			assert_canlock_rwlock(lck, thread, LCK_RW_TYPE_EXCLUSIVE);
2457 #endif /* DEBUG_RW */
2458 			return;
2459 		}
2460 		break;
2461 	default:
2462 		break;
2463 	}
2464 	panic("rw lock (%p)%s held (mode=%u)", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type);
2465 }
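/*
 * Usage sketch (editorial illustration, assumed names): a helper that
 * documents and enforces its locking contract by asserting that the caller
 * already holds the lock exclusively. example_do_update() is hypothetical.
 */
#if 0 /* illustrative sketch, not built */
static void
example_modify_locked(lck_rw_t *lock)
{
	lck_rw_assert(lock, LCK_RW_ASSERT_EXCLUSIVE);   /* caller must hold exclusive */
	example_do_update();                            /* hypothetical modification */
}
#endif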
2466 
2467 /*!
2468  * @function kdp_lck_rw_lock_is_acquired_exclusive
2469  *
2470  * @abstract
2471  * Checks if a rw_lock is held exclusively.
2472  *
2473  * @discussion
2474  * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2475  *
2476  * @param lck   lock to check
2477  *
2478  * @returns TRUE if the lock is held exclusively.
2479  */
2480 boolean_t
2481 kdp_lck_rw_lock_is_acquired_exclusive(
2482 	lck_rw_t        *lck)
2483 {
2484 	if (not_in_kdp) {
2485 		panic("panic: rw lock exclusive check done outside of kernel debugger");
2486 	}
2487 	return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_excl) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2488 }
2489 
2490 void
2491 kdp_rwlck_find_owner(
2492 	__unused struct waitq   *waitq,
2493 	event64_t               event,
2494 	thread_waitinfo_t       *waitinfo)
2495 {
2496 	lck_rw_t        *rwlck = NULL;
2497 	switch (waitinfo->wait_type) {
2498 	case kThreadWaitKernelRWLockRead:
2499 		rwlck = READ_EVENT_TO_RWLOCK(event);
2500 		break;
2501 	case kThreadWaitKernelRWLockWrite:
2502 	case kThreadWaitKernelRWLockUpgrade:
2503 		rwlck = WRITE_EVENT_TO_RWLOCK(event);
2504 		break;
2505 	default:
2506 		panic("%s was called with an invalid blocking type", __FUNCTION__);
2507 		break;
2508 	}
2509 	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2510 	waitinfo->owner = thread_tid(ctid_get_thread(rwlck->lck_rw_owner));
2511 }
2512 
2513 /*!
2514  * @function lck_rw_lock_would_yield_shared
2515  *
2516  * @abstract
2517  * Check whether a rw_lock currently held in shared mode would be yielded
2518  *
2519  * @discussion
2520  * This function can be used when lck_rw_lock_yield_shared() would be
2521  * inappropriate due to the need to perform additional housekeeping
2522  * prior to any yield or when the caller may wish to prematurely terminate
2523  * an operation rather than resume it after regaining the lock.
2524  *
2525  * @param lck           rw_lock already held in shared mode to yield.
2526  *
2527  * @returns TRUE if the lock would yield, FALSE otherwise
2528  */
2529 __mockable
2530 bool
2531 lck_rw_lock_would_yield_shared(
2532 	lck_rw_t        *lck)
2533 {
2534 	lck_rw_word_t   word;
2535 
2536 	lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2537 
2538 	word.data = ordered_load_rw(lck);
2539 	if (word.want_excl || word.want_upgrade) {
2540 		return true;
2541 	}
2542 
2543 	return false;
2544 }
2545 
2546 /*!
2547  * @function lck_rw_lock_yield_shared
2548  *
2549  * @abstract
2550  * Yields a rw_lock held in shared mode.
2551  *
2552  * @discussion
2553  * This function can block.
2554  * Yields the lock in case there are writers waiting.
2555  * The yield will unlock, block, and re-lock the lock in shared mode.
2556  *
2557  * @param lck           rw_lock already held in shared mode to yield.
2558  * @param force_yield   if set to true it will always yield irrespective of the lock status
2559  *
2560  * @returns TRUE if the lock was yielded, FALSE otherwise.
2561  */
2562 bool
2563 lck_rw_lock_yield_shared(
2564 	lck_rw_t        *lck,
2565 	boolean_t       force_yield)
2566 {
2567 	if (lck_rw_lock_would_yield_shared(lck) || force_yield) {
2568 		lck_rw_unlock_shared(lck);
2569 		mutex_pause(2);
2570 		lck_rw_lock_shared(lck);
2571 		return true;
2572 	}
2573 
2574 	return false;
2575 }
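/*
 * Usage sketch (editorial illustration, assumed names): a long scan under the
 * shared hold that periodically offers the lock to waiting writers. If the
 * yield happened, the lock was dropped and re-taken, so any state derived
 * from the protected data must be re-validated. EXAMPLE_NENTRIES and
 * example_scan_one() are hypothetical.
 */
#if 0 /* illustrative sketch, not built */
static void
example_long_scan(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);
	for (int i = 0; i < EXAMPLE_NENTRIES; i++) {    /* hypothetical bound */
		example_scan_one(i);                    /* hypothetical per-entry work */
		if (lck_rw_lock_yield_shared(lock, FALSE)) {
			/* lock was released and re-acquired: restart the scan */
			i = -1;
		}
	}
	lck_rw_unlock_shared(lock);
}
#endif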
2576 
2577 /*!
2578  * @function lck_rw_lock_would_yield_exclusive
2579  *
2580  * @abstract
2581  * Check whether a rw_lock currently held in exclusive mode would be yielded
2582  *
2583  * @discussion
2584  * This function can be used when lck_rw_lock_yield_exclusive would be
2585  * inappropriate due to the need to perform additional housekeeping
2586  * prior to any yield or when the caller may wish to prematurely terminate
2587  * an operation rather than resume it after regaining the lock.
2588  *
2589  * @param lck           rw_lock already held in exclusive mode to yield.
2590  * @param mode          when to yield.
2591  *
2592  * @returns TRUE if the lock would yield, FALSE otherwise
2593  */
2594 __mockable
2595 bool
2596 lck_rw_lock_would_yield_exclusive(
2597 	lck_rw_t        *lck,
2598 	lck_rw_yield_t  mode)
2599 {
2600 	lck_rw_word_t word;
2601 	bool yield = false;
2602 
2603 	lck_rw_assert(lck, LCK_RW_ASSERT_EXCLUSIVE);
2604 
2605 	if (mode == LCK_RW_YIELD_ALWAYS) {
2606 		yield = true;
2607 	} else {
2608 		word.data = ordered_load_rw(lck);
2609 		if (word.w_waiting) {
2610 			yield = true;
2611 		} else if (mode == LCK_RW_YIELD_ANY_WAITER) {
2612 			yield = (word.r_waiting != 0);
2613 		}
2614 	}
2615 
2616 	return yield;
2617 }
2618 
2619 /*!
2620  * @function lck_rw_lock_yield_exclusive
2621  *
2622  * @abstract
2623  * Yields a rw_lock held in exclusive mode.
2624  *
2625  * @discussion
2626  * This function can block.
2627  * Yields the lock in case there are waiters, according to the specified mode.
2628  * The yield will unlock, block, and re-lock the lock in exclusive mode.
2629  *
2630  * @param lck           rw_lock already held in exclusive mode to yield.
2631  * @param mode          when to yield.
2632  *
2633  * @returns TRUE if the lock was yielded, FALSE otherwise.
2634  */
2635 bool
2636 lck_rw_lock_yield_exclusive(
2637 	lck_rw_t        *lck,
2638 	lck_rw_yield_t  mode)
2639 {
2640 	bool yield = lck_rw_lock_would_yield_exclusive(lck, mode);
2641 
2642 	if (yield) {
2643 		lck_rw_unlock_exclusive(lck);
2644 		mutex_pause(2);
2645 		lck_rw_lock_exclusive(lck);
2646 	}
2647 
2648 	return yield;
2649 }
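/*
 * Usage sketch (editorial illustration, assumed names): batched modifications
 * under the exclusive hold, yielding whenever another waiter (reader or
 * writer, per LCK_RW_YIELD_ANY_WAITER) is queued. example_more_work() and
 * example_do_one_update() are hypothetical helpers.
 */
#if 0 /* illustrative sketch, not built */
static void
example_batched_update(lck_rw_t *lock)
{
	lck_rw_lock_exclusive(lock);
	while (example_more_work()) {                   /* hypothetical work predicate */
		example_do_one_update();                /* hypothetical modification */
		(void) lck_rw_lock_yield_exclusive(lock, LCK_RW_YIELD_ANY_WAITER);
	}
	lck_rw_unlock_exclusive(lock);
}
#endif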
2650 
2651 /*!
2652  * @function lck_rw_sleep
2653  *
2654  * @abstract
2655  * Assert_wait on an event while holding the rw_lock.
2656  *
2657  * @discussion
2658  * the flags can decide how to re-acquire the lock upon wake up
2659  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2660  * and if the priority needs to be kept boosted until the lock is
2661  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2662  *
2663  * @param lck                   rw_lock to use to synch the assert_wait.
2664  * @param lck_sleep_action      flags.
2665  * @param event                 event to assert_wait on.
2666  * @param interruptible         wait type.
2667  */
2668 wait_result_t
2669 lck_rw_sleep(
2670 	lck_rw_t                *lck,
2671 	lck_sleep_action_t      lck_sleep_action,
2672 	event_t                 event,
2673 	wait_interrupt_t        interruptible)
2674 {
2675 	wait_result_t           res;
2676 	lck_rw_type_t           lck_rw_type;
2677 	thread_pri_floor_t      token;
2678 
2679 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2680 		panic("Invalid lock sleep action %x", lck_sleep_action);
2681 	}
2682 
2683 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2684 		/*
2685 		 * Although we are dropping the RW lock, the intent in most cases
2686 		 * is that this thread remains as an observer, since it may hold
2687 		 * some secondary resource, but must yield to avoid deadlock. In
2688 		 * this situation, make sure that the thread is boosted to the
2689 		 * ceiling while blocked, so that it can re-acquire the
2690 		 * RW lock at that priority.
2691 		 */
2692 		token = thread_priority_floor_start();
2693 	}
2694 
2695 	res = assert_wait(event, interruptible);
2696 	if (res == THREAD_WAITING) {
2697 		lck_rw_type = lck_rw_done(lck);
2698 		res = thread_block(THREAD_CONTINUE_NULL);
2699 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2700 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2701 				lck_rw_lock(lck, lck_rw_type);
2702 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2703 				lck_rw_lock_exclusive(lck);
2704 			} else {
2705 				lck_rw_lock_shared(lck);
2706 			}
2707 		}
2708 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2709 		(void)lck_rw_done(lck);
2710 	}
2711 
2712 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2713 		thread_priority_floor_end(&token);
2714 	}
2715 
2716 	return res;
2717 }
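/*
 * Usage sketch (editorial illustration, assumed names): waiting for a
 * condition protected by the rw_lock. lck_rw_sleep() drops the lock while
 * blocked and, with no re-acquire flags set (LCK_SLEEP_DEFAULT is assumed
 * here), re-takes it in the mode that was held before sleeping.
 * example_condition(), example_condition_event and example_do_update() are
 * hypothetical.
 */
#if 0 /* illustrative sketch, not built */
static void
example_wait_for_condition(lck_rw_t *lock)
{
	lck_rw_lock_exclusive(lock);
	while (!example_condition()) {                  /* hypothetical predicate */
		(void) lck_rw_sleep(lock, LCK_SLEEP_DEFAULT,
		    (event_t)&example_condition_event, THREAD_UNINT);
	}
	example_do_update();                            /* hypothetical modification */
	lck_rw_unlock_exclusive(lock);
}
#endif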
2718 
2719 /*!
2720  * @function lck_rw_sleep_deadline
2721  *
2722  * @abstract
2723  * Assert_wait_deadline on an event while holding the rw_lock.
2724  *
2725  * @discussion
2726  * the flags can decide how to re-acquire the lock upon wake up
2727  * (LCK_SLEEP_SHARED, or LCK_SLEEP_EXCLUSIVE, or LCK_SLEEP_UNLOCK)
2728  * and if the priority needs to be kept boosted until the lock is
2729  * re-acquired (LCK_SLEEP_PROMOTED_PRI).
2730  *
2731  * @param lck                   rw_lock to use to synch the assert_wait.
2732  * @param lck_sleep_action      flags.
2733  * @param event                 event to assert_wait on.
2734  * @param interruptible         wait type.
2735  * @param deadline              maximum time after which the thread is woken up.
2736  */
2737 wait_result_t
2738 lck_rw_sleep_deadline(
2739 	lck_rw_t                *lck,
2740 	lck_sleep_action_t      lck_sleep_action,
2741 	event_t                 event,
2742 	wait_interrupt_t        interruptible,
2743 	uint64_t                deadline)
2744 {
2745 	wait_result_t           res;
2746 	lck_rw_type_t           lck_rw_type;
2747 	thread_pri_floor_t      token;
2748 
2749 	if ((lck_sleep_action & ~LCK_SLEEP_MASK) != 0) {
2750 		panic("Invalid lock sleep action %x", lck_sleep_action);
2751 	}
2752 
2753 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2754 		token = thread_priority_floor_start();
2755 	}
2756 
2757 	res = assert_wait_deadline(event, interruptible, deadline);
2758 	if (res == THREAD_WAITING) {
2759 		lck_rw_type = lck_rw_done(lck);
2760 		res = thread_block(THREAD_CONTINUE_NULL);
2761 		if (!(lck_sleep_action & LCK_SLEEP_UNLOCK)) {
2762 			if (!(lck_sleep_action & (LCK_SLEEP_SHARED | LCK_SLEEP_EXCLUSIVE))) {
2763 				lck_rw_lock(lck, lck_rw_type);
2764 			} else if (lck_sleep_action & LCK_SLEEP_EXCLUSIVE) {
2765 				lck_rw_lock_exclusive(lck);
2766 			} else {
2767 				lck_rw_lock_shared(lck);
2768 			}
2769 		}
2770 	} else if (lck_sleep_action & LCK_SLEEP_UNLOCK) {
2771 		(void)lck_rw_done(lck);
2772 	}
2773 
2774 	if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
2775 		thread_priority_floor_end(&token);
2776 	}
2777 
2778 	return res;
2779 }
2780 
2781 /*
2782  * Reader-writer lock promotion
2783  *
2784  * We support a limited form of reader-writer
2785  * lock promotion whose effects are:
2786  *
2787  *   * Qualifying threads have decay disabled
2788  *   * Scheduler priority is reset to a floor of
2789  *     their statically assigned priority
2790  *     or MINPRI_RWLOCK
2791  *
2792  * The rationale is that lck_rw_ts do not have
2793  * a single owner, so we cannot apply a directed
2794  * priority boost from all waiting threads
2795  * to all holding threads without maintaining
2796  * lists of all shared owners and all waiting
2797  * threads for every lock.
2798  *
2799  * Instead (and to preserve the uncontended fast-
2800  * path), acquiring (or attempting to acquire)
2801  * a RW lock in shared or exclusive mode increments
2802  * a per-thread counter. Only if that thread stops
2803  * making forward progress (for instance blocking
2804  * on a mutex, or being preempted) do we consult
2805  * the counter and apply the priority floor.
2806  * When the thread becomes runnable again (or in
2807  * the case of preemption it never stopped being
2808  * runnable), it has the priority boost and should
2809  * be in a good position to run on the CPU and
2810  * release all RW locks (at which point the priority
2811  * boost is cleared).
2812  *
2813  * Care must be taken to ensure that priority
2814  * boosts are not retained indefinitely, since unlike
2815  * mutex priority boosts (where the boost is tied
2816  * to the mutex lifecycle), the boost is tied
2817  * to the thread and independent of any particular
2818  * lck_rw_t. Assertions are in place on return
2819  * to userspace so that the boost is not held
2820  * indefinitely.
2821  *
2822  * The routines that increment/decrement the
2823  * per-thread counter should err on the side of
2824  * incrementing any time a preemption is possible
2825  * and the lock would be visible to the rest of the
2826  * system as held (so it should be incremented before
2827  * interlocks are dropped/preemption is enabled, or
2828  * before a CAS is executed to acquire the lock).
2829  *
2830  */
2831 
2832 /*!
2833  * @function lck_rw_clear_promotion
2834  *
2835  * @abstract
2836  * Undo priority promotions when the last rw_lock
2837  * is released by a thread (if a promotion was active).
2838  *
2839  * @param thread        thread to demote.
2840  * @param lock          object reason for the demotion.
2841  */
2842 __attribute__((noinline))
2843 static void
2844 lck_rw_clear_promotion(thread_t thread, const void *lock)
2845 {
2846 	/* Cancel any promotions if the thread had actually blocked while holding a RW lock */
2847 	spl_t s = splsched();
2848 	thread_lock(thread);
2849 
2850 	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
2851 		sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED,
2852 		    unslide_for_kdebug(lock));
2853 	}
2854 
2855 	thread_unlock(thread);
2856 	splx(s);
2857 }
2858 
2859 /*!
2860  * @function lck_rw_set_promotion_locked
2861  *
2862  * @abstract
2863  * Callout from context switch if the thread goes
2864  * off core with a positive rwlock_count.
2865  *
2866  * @discussion
2867  * Called at splsched with the thread locked.
2868  *
2869  * @param thread        thread to promote.
2870  */
2871 __attribute__((always_inline))
2872 void
2873 lck_rw_set_promotion_locked(thread_t thread)
2874 {
2875 	if (LcksOpts & LCK_OPTION_DISABLE_RW_PRIO) {
2876 		return;
2877 	}
2878 
2879 	assert(thread->rwlock_count > 0);
2880 
2881 	if (!(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2882 		sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, 0);
2883 	}
2884 }
2885 
2886 __attribute__((always_inline))
2887 void
2888 lck_rw_lock_count_inc(thread_t thread, const void *lock __unused)
2889 {
2890 	if (thread->rwlock_count++ == 0) {
2891 #if MACH_ASSERT
2892 		/*
2893 		 * Set the ast to check that the
2894 		 * rwlock_count is going to be set to zero when
2895 		 * going back to userspace.
2896 		 * Set it only once when we increment it for the first time.
2897 		 */
2898 		act_set_debug_assert();
2899 #endif
2900 	}
2901 }
2902 
2903 __abortlike
2904 static void
2905 __lck_rw_lock_count_dec_panic(thread_t thread)
2906 {
2907 	panic("rw lock count underflow for thread %p", thread);
2908 }
2909 
2910 __attribute__((always_inline))
2911 void
2912 lck_rw_lock_count_dec(thread_t thread, const void *lock)
2913 {
2914 	uint32_t rwlock_count = thread->rwlock_count--;
2915 
2916 	if (rwlock_count == 0) {
2917 		__lck_rw_lock_count_dec_panic(thread);
2918 	}
2919 
2920 	if (__probable(rwlock_count == 1)) {
2921 		/* sched_flags checked without lock, but will be rechecked while clearing */
2922 		if (__improbable(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
2923 			lck_rw_clear_promotion(thread, lock);
2924 		}
2925 	}
2926 }
2927